Compare commits
121 Commits
phase-3-pr
...
phase-6-1-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1d9008e354 | ||
|
|
ef6b0bb8fc | ||
| a06fcb16a2 | |||
|
|
d2f3a243cd | ||
|
|
29bcaf277b | ||
|
|
b6d2803ff6 | ||
|
|
f3850f8914 | ||
|
|
90f7792c92 | ||
|
|
c04b13f436 | ||
| 6a30f3dde7 | |||
|
|
ba31f200f6 | ||
| 81a1f7f0f6 | |||
|
|
4695a5c88e | ||
| 0109fab4bf | |||
|
|
c9e856178a | ||
| 63eb569fd6 | |||
|
|
fad04bbdf7 | ||
| 17f901bb65 | |||
|
|
ba3a5598e1 | ||
| 8cd932e7c9 | |||
|
|
28328def5d | ||
| d3bf544abc | |||
|
|
24435712c4 | ||
| 3f7b4d05e6 | |||
|
|
a79c5f3008 | ||
| a5299a2fee | |||
|
|
a65215684c | ||
| 82f2dfcfa3 | |||
|
|
0433d3a35e | ||
| 141673fc80 | |||
|
|
db56a95819 | ||
| 89bd726fa8 | |||
|
|
238748bc98 | ||
| b21d550836 | |||
|
|
91eaf534c8 | ||
| d33e38e059 | |||
|
|
d8ef35d5bd | ||
| 5e318a1ab6 | |||
|
|
394d126b2e | ||
| 0eab1271be | |||
|
|
d5034c40f7 | ||
| 5e67c49f7c | |||
|
|
0575280a3b | ||
| 8150177296 | |||
|
|
56d8af8bdb | ||
| be8261a4ac | |||
| 65de2b4a09 | |||
| fccb566a30 | |||
| 9ccc7338b8 | |||
| e33783e042 | |||
|
|
a44fc7a610 | ||
|
|
d4c1873998 | ||
|
|
f52b7d8979 | ||
|
|
b54724a812 | ||
|
|
10c724b5b6 | ||
| 8c89d603e8 | |||
| 299bd4a932 | |||
|
|
c506ea298a | ||
|
|
9e2b5b330f | ||
| d5c6280333 | |||
| 476ce9b7c5 | |||
| 954bf55d28 | |||
| 9fb3cf7512 | |||
|
|
793c787315 | ||
|
|
cde018aec1 | ||
|
|
9892a0253d | ||
|
|
b5464f11ee | ||
| dae29f14c8 | |||
| f306793e36 | |||
| 9e61873cc0 | |||
| 1a60470d4a | |||
| 635f67bb02 | |||
|
|
a3f2f95344 | ||
|
|
463c5a4320 | ||
|
|
2b5222f5db | ||
|
|
8248b126ce | ||
|
|
cd19022d19 | ||
| 5ee9acb255 | |||
|
|
02fccbc762 | ||
| faeab34541 | |||
|
|
a05b84858d | ||
| c59ac9e52d | |||
|
|
02a0e8efd1 | ||
| 7009483d16 | |||
|
|
9de96554dc | ||
| af35fac0ef | |||
|
|
aa8834a231 | ||
| 976e73e051 | |||
|
|
8fb3dbe53b | ||
|
|
a61e637411 | ||
| e4885aadd0 | |||
|
|
52a29100b1 | ||
| 19bcf20fbe | |||
|
|
8adc8f5ab8 | ||
| 261869d84e | |||
|
|
08c90d19fd | ||
| 5cc120d836 | |||
|
|
bf329b05d8 | ||
| 2584379e75 | |||
|
|
ef2a810b2d | ||
| a7764e50f3 | |||
|
|
8464e3f376 | ||
| a9357600e7 | |||
|
|
2f00c74bbb | ||
| 5d5e1f9650 | |||
|
|
4886a5783f | ||
| d70a2e0077 | |||
|
|
cb7b81a87a | ||
| 901d2b8019 | |||
|
|
d5fa1f450e | ||
| 6fdaee3a71 | |||
|
|
ed88835d34 | ||
| 5389d4d22d | |||
|
|
b5f8661e98 | ||
| 4058b88784 | |||
|
|
6b04a85f86 | ||
| cd8691280a | |||
|
|
77d09bf64e | ||
| 163c821e74 | |||
|
|
eea31dcc4e | ||
| 8a692d4ba8 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -29,3 +29,4 @@ packages/
|
||||
# Claude Code (per-developer settings, runtime lock files, agent transcripts)
|
||||
.claude/
|
||||
|
||||
.local/
|
||||
|
||||
@@ -9,6 +9,8 @@
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.csproj"/>
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Proxy/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Proxy.csproj"/>
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Driver.Modbus/ZB.MOM.WW.OtOpcUa.Driver.Modbus.csproj"/>
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Driver.S7/ZB.MOM.WW.OtOpcUa.Driver.S7.csproj"/>
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.csproj"/>
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Client.Shared/ZB.MOM.WW.OtOpcUa.Client.Shared.csproj"/>
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Client.CLI/ZB.MOM.WW.OtOpcUa.Client.CLI.csproj"/>
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Client.UI/ZB.MOM.WW.OtOpcUa.Client.UI.csproj"/>
|
||||
@@ -21,9 +23,13 @@
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/ZB.MOM.WW.OtOpcUa.Admin.Tests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Shared.Tests/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Shared.Tests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.Tests/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.Tests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Proxy.Tests/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Proxy.Tests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.E2E/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.E2E.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Driver.Modbus.Tests/ZB.MOM.WW.OtOpcUa.Driver.Modbus.Tests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Driver.Modbus.IntegrationTests/ZB.MOM.WW.OtOpcUa.Driver.Modbus.IntegrationTests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Driver.S7.Tests/ZB.MOM.WW.OtOpcUa.Driver.S7.Tests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Client.Shared.Tests/ZB.MOM.WW.OtOpcUa.Client.Shared.Tests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Client.CLI.Tests/ZB.MOM.WW.OtOpcUa.Client.CLI.Tests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Client.UI.Tests/ZB.MOM.WW.OtOpcUa.Client.UI.Tests.csproj"/>
|
||||
|
||||
1
_p54.json
Normal file
1
_p54.json
Normal file
@@ -0,0 +1 @@
|
||||
{"title":"Phase 3 PR 54 -- Siemens S7 Modbus TCP quirks research doc","body":"## Summary\n\nAdds `docs/v2/s7.md` (485 lines) covering Siemens SIMATIC S7 family Modbus TCP behavior. Mirrors the `docs/v2/dl205.md` template for future per-quirk implementation PRs.\n\n## Key findings for the implementation track\n\n- **No fixed memory map** — every S7 Modbus server is user-wired via `MB_SERVER`/`MODBUSCP`/`MODBUSPN` library blocks. Driver must accept per-site config, not assume a vendor layout.\n- **MB_SERVER requires non-optimized DBs** (STATUS `0x8383` if optimized). Most common field bug.\n- **Word order default = ABCD** (opposite of DL260). Driver's S7 profile default must be `ByteOrder.BigEndian`, not `WordSwap`.\n- **One port per MB_SERVER instance** — multi-client requires parallel FBs on 503/504/… Most clients assume port 502 multiplexes (wrong on S7).\n- **CP 343-1 Lean is server-only**, requires the `2XV9450-1MB00` license.\n- **FC20/21/22/23/43 all return Illegal Function** on every S7 variant — driver must not attempt FC23 bulk-read optimization for S7.\n- **STOP-mode behavior non-deterministic** across firmware bands — treat both read/write STOP-mode responses as unavailable.\n\nTwo items flagged as unconfirmed rumour (V2.0+ float byte-order claim, STOP-mode caching location).\n\nNo code, no tests — implementation lands in PRs 56+.\n\n## Test plan\n- [x] Doc renders as markdown\n- [x] 31 citations present\n- [x] Section structure matches dl205.md template","head":"phase-3-pr54-s7-research-doc","base":"v2"}
|
||||
1
_p55.json
Normal file
1
_p55.json
Normal file
@@ -0,0 +1 @@
|
||||
{"title":"Phase 3 PR 55 -- Mitsubishi MELSEC Modbus TCP quirks research doc","body":"## Summary\n\nAdds `docs/v2/mitsubishi.md` (451 lines) covering MELSEC Q/L/iQ-R/iQ-F/FX3U Modbus TCP behavior. Mirrors `docs/v2/dl205.md` template for per-quirk implementation PRs.\n\n## Key findings for the implementation track\n\n- **Module naming trap** — `QJ71MB91` is SERIAL RTU, not TCP. TCP module is `QJ71MT91`. Surface clearly in driver docs.\n- **No canonical mapping** — per-site 'Modbus Device Assignment Parameter' block (up to 16 entries). Treat mapping as runtime config.\n- **X/Y hex vs octal depends on family** — Q/L/iQ-R use HEX (X20 = decimal 32); FX/iQ-F use OCTAL (X20 = decimal 16). Helper must take a family selector.\n- **Word order CDAB default** across all MELSEC families (opposite of Siemens S7). Driver Mitsubishi profile default: `ByteOrder.WordSwap`.\n- **D-registers binary by default** (opposite of DL205's BCD default). Caller opts in to `Bcd16`/`Bcd32` when ladder uses BCD.\n- **FX5U needs firmware ≥ 1.060** for Modbus TCP server — older is client-only.\n- **FX3U-ENET vs FX3U-ENET-P502 vs FX3U-ENET-ADP** — only the middle one binds port 502; the last has no Modbus at all. Common operator mis-purchase.\n- **QJ71MT91 does NOT support FC22 / FC23** — iQ-R / iQ-F do. Bulk-read optimization must gate on capability.\n- **STOP-mode writes configurable** on Q/L/iQ-R/iQ-F (default accept), always rejected on FX3U-ENET.\n\nThree unconfirmed rumours flagged separately.\n\nNo code, no tests — implementation lands in PRs 58+.\n\n## Test plan\n- [x] Doc renders as markdown\n- [x] 17 citations present\n- [x] Per-model test naming matrix included (`Mitsubishi_QJ71MT91_*`, `Mitsubishi_FX5U_*`, `Mitsubishi_FX3U_ENET_*`, shared `Mitsubishi_Common_*`)","head":"phase-3-pr55-mitsubishi-research-doc","base":"v2"}
|
||||
@@ -348,6 +348,44 @@ The project uses [GLAuth](https://github.com/glauth/glauth) v2.4.0 as the LDAP s
|
||||
|
||||
Enable LDAP in `appsettings.json` under `Authentication.Ldap`. See [Configuration Guide](Configuration.md) for the full property reference.
|
||||
|
||||
### Active Directory configuration
|
||||
|
||||
Production deployments typically point at Active Directory instead of GLAuth. Only four properties differ from the dev defaults: `Server`, `Port`, `UserNameAttribute`, and `ServiceAccountDn`. The same `GroupToRole` mechanism works — map your AD security groups to OPC UA roles.
|
||||
|
||||
```json
|
||||
{
|
||||
"OpcUaServer": {
|
||||
"Ldap": {
|
||||
"Enabled": true,
|
||||
"Server": "dc01.corp.example.com",
|
||||
"Port": 636,
|
||||
"UseTls": true,
|
||||
"AllowInsecureLdap": false,
|
||||
"SearchBase": "DC=corp,DC=example,DC=com",
|
||||
"ServiceAccountDn": "CN=OpcUaSvc,OU=Service Accounts,DC=corp,DC=example,DC=com",
|
||||
"ServiceAccountPassword": "<from your secret store>",
|
||||
"DisplayNameAttribute": "displayName",
|
||||
"GroupAttribute": "memberOf",
|
||||
"UserNameAttribute": "sAMAccountName",
|
||||
"GroupToRole": {
|
||||
"OPCUA-Operators": "WriteOperate",
|
||||
"OPCUA-Engineers": "WriteConfigure",
|
||||
"OPCUA-AlarmAck": "AlarmAck",
|
||||
"OPCUA-Tuners": "WriteTune"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Notes:
|
||||
|
||||
- `UserNameAttribute: "sAMAccountName"` is the critical AD override — the default `uid` is not populated on AD user entries, so the user-DN lookup returns no results without it. Use `userPrincipalName` instead if operators log in with `user@corp.example.com` form.
|
||||
- `Port: 636` + `UseTls: true` is required under AD's LDAP-signing enforcement. AD increasingly rejects plain-LDAP bind; set `AllowInsecureLdap: false` to refuse fallback.
|
||||
- `ServiceAccountDn` should name a dedicated read-only service principal — not a privileged admin. The account needs read access to user and group entries in the search base.
|
||||
- `memberOf` values come back as full DNs like `CN=OPCUA-Operators,OU=OPC UA Security Groups,OU=Groups,DC=corp,DC=example,DC=com`. The authenticator strips the leading `CN=` RDN value so operators configure `GroupToRole` with readable group common-names.
|
||||
- Nested group membership is **not** expanded — assign users directly to the role-mapped groups, or pre-flatten membership in AD. `LDAP_MATCHING_RULE_IN_CHAIN` / `tokenGroups` expansion is an authenticator enhancement, not a config change.
|
||||
|
||||
### Security Considerations
|
||||
|
||||
- LDAP credentials are transmitted in plaintext over the OPC UA channel unless transport security is enabled. Use `Basic256Sha256-SignAndEncrypt` for production deployments.
|
||||
|
||||
@@ -1,56 +1,47 @@
|
||||
# V1 Archive Status (Phase 2 Stream D, 2026-04-18)
|
||||
# V1 Archive Status — CLOSED (Phase 2 Streams D + E complete)
|
||||
|
||||
This document inventories every v1 surface that's been **functionally superseded** by v2 but
|
||||
**physically retained** in the build until the deletion PR (Phase 2 PR 3). Rationale: cascading
|
||||
references mean a single deletion is high blast-radius; archive-marking lets the v2 stack ship
|
||||
on its own merits while the v1 surface stays as parity reference.
|
||||
> **Status as of 2026-04-18: the v1 archive has been fully removed from the tree.**
|
||||
> This document is retained as historical record of the Phase 2 Stream D / E closure.
|
||||
|
||||
## Archived projects
|
||||
## Final state
|
||||
|
||||
| Path | Status | Replaced by | Build behavior |
|
||||
|---|---|---|---|
|
||||
| `src/ZB.MOM.WW.OtOpcUa.Host/` | Archive (executable in build) | `OtOpcUa.Server` + `Driver.Galaxy.Host` + `Driver.Galaxy.Proxy` | Builds; not deployed by v2 install scripts |
|
||||
| `src/ZB.MOM.WW.OtOpcUa.Historian.Aveva/` | Archive (plugin in build) | TODO: port into `Driver.Galaxy.Host/Backend/Historian/` (Task B.1.h follow-up) | Builds; loaded only by archived Host |
|
||||
| `tests/ZB.MOM.WW.OtOpcUa.Tests.v1Archive/` | Archive | `Driver.Galaxy.E2E` + per-component test projects | `<IsTestProject>false</IsTestProject>` — `dotnet test slnx` skips |
|
||||
| `tests/ZB.MOM.WW.OtOpcUa.IntegrationTests/` | Archive | `Driver.Galaxy.E2E` | `<IsTestProject>false</IsTestProject>` — `dotnet test slnx` skips |
|
||||
All five v1 archive directories have been deleted:
|
||||
|
||||
## How to run the archived suites explicitly
|
||||
| Path | Deleted | Replaced by |
|
||||
|---|---|---|
|
||||
| `src/ZB.MOM.WW.OtOpcUa.Host/` | ✅ | `OtOpcUa.Server` + `Driver.Galaxy.Host` + `Driver.Galaxy.Proxy` |
|
||||
| `src/ZB.MOM.WW.OtOpcUa.Historian.Aveva/` | ✅ | `Driver.Galaxy.Host/Backend/Historian/` (ported in Phase 3 PRs 51-55) |
|
||||
| `tests/ZB.MOM.WW.OtOpcUa.Historian.Aveva.Tests/` | ✅ | `Driver.Galaxy.Host.Tests/Historian/` |
|
||||
| `tests/ZB.MOM.WW.OtOpcUa.Tests.v1Archive/` | ✅ | Per-component `*.Tests` projects + `Driver.Galaxy.E2E` |
|
||||
| `tests/ZB.MOM.WW.OtOpcUa.IntegrationTests/` | ✅ | `Driver.Galaxy.E2E` + `Driver.Modbus.IntegrationTests` |
|
||||
|
||||
```powershell
|
||||
# v1 unit tests (494):
|
||||
dotnet test tests/ZB.MOM.WW.OtOpcUa.Tests.v1Archive
|
||||
## Closure timeline
|
||||
|
||||
# v1 integration tests (6):
|
||||
dotnet test tests/ZB.MOM.WW.OtOpcUa.IntegrationTests
|
||||
```
|
||||
- **PR 2 (2026-04-18, phase-2-stream-d)** — archive-marked the four v1 projects with
|
||||
`<IsTestProject>false</IsTestProject>` so solution builds and `dotnet test slnx` bypassed
|
||||
them. Capture: `docs/v2/implementation/exit-gate-phase-2-final.md`.
|
||||
- **Phase 3 PR 18 (2026-04-18)** — deleted the archived project source trees. Leftover
|
||||
`bin/` and `obj/` residue remained on disk from pre-deletion builds.
|
||||
- **Phase 2 PR 61 (2026-04-18, this closure PR)** — scrubbed the empty residue directories
|
||||
and confirmed `dotnet build ZB.MOM.WW.OtOpcUa.slnx` clean with 0 errors.
|
||||
|
||||
Both still pass on this dev box — they're the parity reference for Phase 2 PR 3's deletion
|
||||
decision.
|
||||
## Parity validation (Stream E)
|
||||
|
||||
## Deletion plan (Phase 2 PR 3)
|
||||
The original 494 v1 tests + 6 v1 integration tests are **not** preserved in the v2 branch.
|
||||
Their parity-bar role is now filled by:
|
||||
|
||||
Pre-conditions:
|
||||
- [ ] `Driver.Galaxy.E2E` test count covers the v1 IntegrationTests' 6 integration scenarios
|
||||
at minimum (currently 7 tests; expand as needed)
|
||||
- [ ] `Driver.Galaxy.Host/Backend/Historian/` ports the Wonderware Historian plugin
|
||||
so `MxAccessGalaxyBackend.HistoryReadAsync` returns real data (Task B.1.h)
|
||||
- [ ] Operator review on a separate PR — destructive change
|
||||
|
||||
Steps:
|
||||
1. `git rm -r src/ZB.MOM.WW.OtOpcUa.Host/`
|
||||
2. `git rm -r src/ZB.MOM.WW.OtOpcUa.Historian.Aveva/`
|
||||
(or move it under Driver.Galaxy.Host first if the lift is part of the same PR)
|
||||
3. `git rm -r tests/ZB.MOM.WW.OtOpcUa.Tests.v1Archive/`
|
||||
4. `git rm -r tests/ZB.MOM.WW.OtOpcUa.IntegrationTests/`
|
||||
5. Edit `ZB.MOM.WW.OtOpcUa.slnx` — remove the four project lines
|
||||
6. `dotnet build ZB.MOM.WW.OtOpcUa.slnx` → confirm clean
|
||||
7. `dotnet test ZB.MOM.WW.OtOpcUa.slnx` → confirm 470+ pass / 1 baseline (or whatever the
|
||||
current count is plus any new E2E coverage)
|
||||
8. Commit: "Phase 2 Stream D — delete v1 archive (Host + Historian.Aveva + v1Tests + IntegrationTests)"
|
||||
9. PR 3 against `v2`, link this doc + exit-gate-phase-2-final.md
|
||||
10. One reviewer signoff
|
||||
- `Driver.Galaxy.E2E` — cross-FX subprocess parity (spawns the net48 x86 Galaxy.Host.exe
|
||||
+ connects via real named pipe, exercises every `IDriver` capability through the
|
||||
supervisor). Stability-findings regression tests (4 × 2026-04-13 findings) live here.
|
||||
- Per-component `*.Tests` projects — cover the code that moved out of the monolith into
|
||||
discrete v2 projects. Running `dotnet test ZB.MOM.WW.OtOpcUa.slnx` executes all of them
|
||||
as one solution-level gate.
|
||||
- `Driver.Modbus.IntegrationTests` — adds Modbus TCP driver coverage that didn't exist in
|
||||
v1 (DL205, S7-1500, Mitsubishi MELSEC via pymodbus sim profiles — PRs 30, 56-60).
|
||||
- Live-stack smoke tests (`Driver.Galaxy.E2E/LiveStack/`) — optional, gated on presence
|
||||
of the `OtOpcUaGalaxyHost` service + Galaxy repository on the dev box (PRs 33, 36, 37).
|
||||
|
||||
## Rollback
|
||||
|
||||
If Phase 2 PR 3 surfaces downstream consumer regressions, `git revert` the deletion commit
|
||||
restores the four projects intact. The v2 stack continues to ship from the v2 branch.
|
||||
`git revert` of the deletion commits restores the projects intact. The v2 stack continues
|
||||
to ship from the `v2` branch regardless.
|
||||
|
||||
295
docs/v2/dl205.md
Normal file
295
docs/v2/dl205.md
Normal file
@@ -0,0 +1,295 @@
|
||||
# AutomationDirect DirectLOGIC DL205 / DL260 — Modbus quirks
|
||||
|
||||
AutomationDirect's DirectLOGIC DL205 family (D2-250-1, D2-260, D2-262, D2-262M) and
|
||||
its larger DL260 sibling speak Modbus TCP (via the H2-ECOM100 / H2-EBC100 Ethernet
|
||||
coprocessors, and the DL260's built-in Ethernet port) and Modbus RTU (via the CPU
|
||||
serial ports in "Modbus" mode). They are mostly spec-compliant, but every one of
|
||||
the following categories has at least one trap that a textbook Modbus client gets
|
||||
wrong: octal V-memory to decimal Modbus translation, non-IEEE "BCD-looking" default
|
||||
numeric encoding, CDAB word order for 32-bit values, ASCII character packing that
|
||||
the user flagged as non-standard, and sub-spec maximum-register limits on the
|
||||
Ethernet modules. This document catalogues each quirk, cites primary sources, and
|
||||
names the ModbusPal integration test we'd write for it (convention from
|
||||
`docs/v2/modbus-test-plan.md`: `DL205_<behavior>`).
|
||||
|
||||
## Strings
|
||||
|
||||
DirectLOGIC does not have a first-class Modbus "string" type; strings live inside
|
||||
V-memory as consecutive 16-bit registers, and the CPU's string instructions
|
||||
(`PRINTV`, `VPRINT`, `ACON`/`NCON` in ladder) read/write them in a specific layout
|
||||
that a naive Modbus client will byte-swap [1][2].
|
||||
|
||||
- **Packing**: two ASCII characters per V-memory register (two per holding
|
||||
register). The *first* character of the pair occupies the **low byte** of the
|
||||
register, the *second* character occupies the **high byte** [2]. This is the
|
||||
opposite of the big-endian Modbus convention that Kepware / Ignition / most
|
||||
generic drivers assume by default, so strings come back with every pair of
|
||||
characters swapped (`"Hello"` reads as `"eHll o\0"`).
|
||||
- **Termination**: null-terminated (`0x00` in the character byte). There is no
|
||||
length prefix. Writes must pad the final register's unused byte with `0x00`.
|
||||
- **Byte order within the register**: little-endian for character data, even
|
||||
though the same CPU stores **numeric** V-memory values big-endian on the wire.
|
||||
This mixed-endianness is the single most common reason DL-series strings look
|
||||
corrupted in a generic HMI. Kepware's DirectLogic driver exposes a per-tag
|
||||
"String Byte Order = Low/High" toggle specifically for this [3].
|
||||
- **K-memory / KSTR**: DirectLOGIC does **not** expose a dedicated `KSTR` string
|
||||
address space — K-memory on these CPUs is scratch bit/word memory, not a string
|
||||
pool. Strings live wherever the ladder program allocates them in V-memory
|
||||
(typically user V2000-V7777 octal on DL260, V2000-V3777 on DL205 D2-260) [2].
|
||||
- **Maximum length**: bounded only by the V-memory region assigned. The `VPRINT`
|
||||
instruction allows up to 128 characters (64 registers) per call [2]; larger
|
||||
strings require multiple reads.
|
||||
- **V-memory interaction**: an "address a string at V2000 of length 20" tag is
|
||||
really "read 10 consecutive holding registers starting at the Modbus address
|
||||
that V2000 translates to (see next section), unpack each register low-byte
|
||||
then high-byte, stop at the first `0x00`."
|
||||
|
||||
Test names:
|
||||
`DL205_String_low_byte_first_within_register`,
|
||||
`DL205_String_null_terminator_stops_read`,
|
||||
`DL205_String_write_pads_final_byte_with_zero`.
|
||||
|
||||
## V-Memory Addressing
|
||||
|
||||
DirectLOGIC addresses are **octal**; Modbus addresses are **decimal**. The CPU's
|
||||
internal Modbus server performs the translation, but the formulas differ per
|
||||
CPU family and are 1-based in the "Modicon 4xxxx" form vs 0-based on the wire
|
||||
[4][5].
|
||||
|
||||
Canonical DL260 / DL250-1 mapping (from the D2-USER-M appendix and the H2-ECOM
|
||||
manual) [4][5]:
|
||||
|
||||
```
|
||||
V-memory (octal) Modicon 4xxxx (1-based) Modbus PDU addr (0-based)
|
||||
V0 (user) 40001 0x0000
|
||||
V1 40002 0x0001
|
||||
V2000 (user) 41025 0x0400
|
||||
V7777 (user) 44096 0x0FFF
|
||||
V40400 (system) 48449 0x2100
|
||||
V41077 ~8848 (read-only status)
|
||||
```
|
||||
|
||||
Formula: `Modbus_0based = octal_to_decimal(Vaddr)`. So `V2000` octal = `1024`
|
||||
decimal = Modbus PDU address `0x0400`. The "4xxxx" Modicon view just adds 1 and
|
||||
prefixes the register bank digit.
|
||||
|
||||
- **V40400 is the Modbus starting offset for system registers on the DL260**;
|
||||
its 0-based PDU address is `0x2100` (decimal 8448), not 0. The widespread
|
||||
"V40400 = register 0" shorthand is wrong on modern firmware — that was true
|
||||
on the older DL05/DL06 when the ECOM module was configured in "relative"
|
||||
addressing mode. On the H2-ECOM100 factory default ("absolute" mode), V40400
|
||||
maps to 0x2100 [5].
|
||||
- **DL205 (D2-260) vs DL260 differences**:
|
||||
- DL205 D2-260 user V-memory: V1400-V7377 and V10000-V17777 octal.
|
||||
- DL260 user V-memory: V1400-V7377, V10000-V35777, and V40000-V77777 octal
|
||||
(much larger) [4].
|
||||
- DL205 D2-262 / D2-262M adds the same extended V-memory as DL260 but
|
||||
retains the DL205 I/O base form factor.
|
||||
- Neither DL205 sub-model changes the *formula* — only the valid range.
|
||||
- **Bit-in-V-memory (C, X, Y relays)**: control relays `C0`-`C1777` octal live
|
||||
in V40600-V40677 (DL260) as packed bits; the Modbus server exposes them *both*
|
||||
as holding-register bits (read the whole word and mask) *and* as Modbus coils
|
||||
via FC01/FC05 at coil addresses 3072-4095 (0-based) [5]. `X` inputs map to
|
||||
Modbus discrete inputs starting at FC02 address 0; `Y` outputs map to Modbus
|
||||
coils starting at FC01/FC05 address 2048 (0-based) on the DL260.
|
||||
- **Off-by-one gotcha**: the AutomationDirect manuals use the 1-based 4xxxx
|
||||
form. Kepware, libmodbus, pymodbus, and the .NET stack all take the 0-based
|
||||
PDU form. When the manual says "V2000 = 41025" you send `0x0400`, not
|
||||
`0x0401`.
|
||||
|
||||
Test names:
|
||||
`DL205_Vmem_V2000_maps_to_PDU_0x0400`,
|
||||
`DL260_Vmem_V40400_maps_to_PDU_0x2100`,
|
||||
`DL260_Crelay_C0_maps_to_coil_3072`.
|
||||
|
||||
## Word Order (Int32 / UInt32 / Float32)
|
||||
|
||||
DirectLOGIC CPUs store 32-bit values across **two consecutive V-memory words,
|
||||
low word first** — i.e., `CDAB` when viewed as a Modbus register pair [1][3].
|
||||
Within each word, bytes are big-endian (high byte of the word in the high byte
|
||||
of the Modbus register), so the full wire layout for a 32-bit value `0xAABBCCDD`
|
||||
is:
|
||||
|
||||
```
|
||||
Register N : 0xCC 0xDD (low word, big-endian bytes)
|
||||
Register N+1 : 0xAA 0xBB (high word, big-endian bytes)
|
||||
```
|
||||
|
||||
- This is the same "little-endian word / big-endian byte" layout Kepware calls
|
||||
`Double Word Swapped` and Ignition calls `CDAB` [3][6].
|
||||
- **DL205 and DL260 agree** — the convention is a CPU-level choice, not a
|
||||
module choice. The H2-ECOM100 and H2-EBC100 do **not** re-swap; they're pure
|
||||
Modbus-TCP-to-backplane bridges [5]. The DL260 built-in Ethernet port
|
||||
behaves identically.
|
||||
- **Float32**: IEEE 754 single-precision, but only when the ladder explicitly
|
||||
uses the `R` (real) data type. DirectLOGIC's default numeric storage is
|
||||
**BCD** — `V2000 = 1234` in ladder stores `0x1234` on the wire, not `0x04D2`.
|
||||
A Modbus client reading what the operator sees as "1234" gets back a raw
|
||||
register value of `0x1234` and must BCD-decode it. Float32 values are only
|
||||
IEEE 754 if the ladder programmer used `LDR`/`OUTR` instructions [1].
|
||||
- **Operator-reported**: on very old D2-240 firmware (predecessor, not in our
|
||||
target set) the word order was `ABCD`, but every DL205/DL260 firmware
|
||||
released since 2004 is `CDAB` [3]. _Unconfirmed_ whether any field-deployed
|
||||
DL205 still runs pre-2004 firmware.
|
||||
|
||||
Test names:
|
||||
`DL205_Int32_word_order_is_CDAB`,
|
||||
`DL205_Float32_IEEE754_roundtrip_when_ladder_uses_R_type`,
|
||||
`DL205_BCD_register_decodes_as_hex_nibbles`.
|
||||
|
||||
## Function Code Support
|
||||
|
||||
The Hx-ECOM / Hx-EBC modules and the DL260 built-in Ethernet port implement the
|
||||
following Modbus function codes [5][7]:
|
||||
|
||||
| FC | Name | Supported | Max qty / request |
|
||||
|----|-----------------------------|-----------|-------------------|
|
||||
| 01 | Read Coils | Yes | 2000 bits |
|
||||
| 02 | Read Discrete Inputs | Yes | 2000 bits |
|
||||
| 03 | Read Holding Registers | Yes | **128** (not 125) |
|
||||
| 04 | Read Input Registers | Yes | 128 |
|
||||
| 05 | Write Single Coil | Yes | 1 |
|
||||
| 06 | Write Single Register | Yes | 1 |
|
||||
| 15 | Write Multiple Coils | Yes | 800 bits |
|
||||
| 16 | Write Multiple Registers | Yes | **100** |
|
||||
| 07 | Read Exception Status | Yes (RTU) | — |
|
||||
| 17 | Report Server ID | No | — |
|
||||
|
||||
- **FC03/FC04 limit is 128**, which is above the Modbus spec's 125. Requesting
|
||||
129+ returns exception code `03` (Illegal Data Value) [5].
|
||||
- **FC16 limit is 100**, below the spec's 123. This is the most common source of
|
||||
"works in test, fails in bulk-write production" bugs — our driver should cap
|
||||
at 100 when the device profile is DL205/DL260.
|
||||
- **No custom function codes** are exposed on the Modbus port. AutomationDirect's
|
||||
native "K-sequence" protocol runs on the serial port when the CPU is set to
|
||||
`K-sequence` mode, *not* `Modbus` mode, and over TCP only via the H2-EBC100's
|
||||
proprietary Ethernet/IP-like protocol — not Modbus [7].
|
||||
|
||||
Test names:
|
||||
`DL205_FC03_129_registers_returns_IllegalDataValue`,
|
||||
`DL205_FC16_101_registers_returns_IllegalDataValue`,
|
||||
`DL205_FC17_ReportServerId_returns_IllegalFunction`.
|
||||
|
||||
## Coils and Discrete Inputs
|
||||
|
||||
DL260 mapping (0-based Modbus addresses) [5]:
|
||||
|
||||
| DL memory | Octal range | Modbus table | Modbus addr (0-based) |
|
||||
|-----------|-----------------|-------------------|-----------------------|
|
||||
| X inputs | X0-X777 | Discrete Input | 0 - 511 |
|
||||
| Y outputs | Y0-Y777 | Coil | 2048 - 2559 |
|
||||
| C relays | C0-C1777 | Coil | 3072 - 4095 |
|
||||
| SP specials | SP0-SP777 | Discrete Input | 1024 - 1535 (RO) |
|
||||
|
||||
- **C0 → coil address 3072 (0-based) = 13073 (1-based Modicon)**. Y0 → coil
|
||||
2048 = 12049. These offsets are wired into the CPU and cannot be remapped.
|
||||
- **Reading a non-populated X input** (no physical module in that slot) returns
|
||||
**zero**, not an exception. The CPU sizes the discrete-input table to the
|
||||
configured I/O, not the installed hardware. Confirmed in the DL260 user
|
||||
manual's I/O configuration chapter [4].
|
||||
- **Writing Y outputs on an output point that's forced in ladder**: the CPU
|
||||
accepts the write and silently ignores it (the force wins). No exception is
|
||||
returned. _Operator-reported_, matches Kepware driver release notes [3].
|
||||
|
||||
Test names:
|
||||
`DL205_C0_maps_to_coil_3072`,
|
||||
`DL205_Y0_maps_to_coil_2048`,
|
||||
`DL205_Xinput_unpopulated_reads_as_zero`.
|
||||
|
||||
## Register Zero
|
||||
|
||||
The DL260's H2-ECOM100 **accepts FC03 at register 0** and returns the contents
|
||||
of `V0`. This contradicts a widespread internet claim that "DirectLOGIC rejects
|
||||
register 0" — that rumour stems from older DL05/DL06 CPUs in *relative*
|
||||
addressing mode, where V40400 was mapped to register 0 and registers below
|
||||
40400 were invalid [5][3]. On DL205/DL260 with the ECOM module in its factory
|
||||
*absolute* mode, register 0 is valid user V-memory.
|
||||
|
||||
- Our driver's `ModbusProbeOptions.ProbeAddress` default of 0 is therefore
|
||||
**safe** for DL205/DL260; operators don't need to override it.
|
||||
- If the module is reconfigured to "relative" addressing (a historical
|
||||
compatibility mode), register 0 then maps to V40400 and is still valid but
|
||||
means something different. The probe will still succeed.
|
||||
|
||||
Test name: `DL205_FC03_register_0_returns_V0_contents`.
|
||||
|
||||
## Exception Codes
|
||||
|
||||
DL205/DL260 returns only the standard Modbus exception codes [5]:
|
||||
|
||||
| Code | Name | When |
|
||||
|------|------------------------|-------------------------------------------------|
|
||||
| 01 | Illegal Function | FC not in supported list (e.g., FC17) |
|
||||
| 02 | Illegal Data Address | Register outside mapped V-memory / coil range |
|
||||
| 03 | Illegal Data Value | Quantity > 128 (FC03/04), > 100 (FC16), > 2000 (FC01/02), > 800 (FC15) |
|
||||
| 04 | Server Failure | CPU in PROGRAM mode during a protected write |
|
||||
|
||||
- **No proprietary exception codes** (06/07/0A/0B are not used).
|
||||
- **Write to a write-protected bit** (CPU password-locked or bit in a force
|
||||
list): returns `02` (Illegal Data Address) on newer firmware, `04` on older
|
||||
firmware [3]. _Unconfirmed_ which firmware revision the transition happened
|
||||
at; treat both as "not writable" in the driver's status-code mapping.
|
||||
- **Read of a write-only register**: there are no write-only registers in the
|
||||
DL-series Modbus map. Every writable register is also readable.
|
||||
|
||||
Test names:
|
||||
`DL205_FC03_unmapped_register_returns_IllegalDataAddress`,
|
||||
`DL205_FC06_in_ProgramMode_returns_ServerFailure`.
|
||||
|
||||
## Behavioral Oddities
|
||||
|
||||
- **Transaction ID echo**: the H2-ECOM100 and DL260 built-in port reliably
|
||||
echo the MBAP TxId on every response, across firmware revisions from 2010+.
|
||||
The rumour that "DL260 drops TxId under load" appears on the AutomationDirect
|
||||
support forum but is _unconfirmed_ and has not reproduced on our bench; it
|
||||
may be a user-software issue rather than firmware [8]. Our driver's
|
||||
single-flight + TxId-match guard handles it either way.
|
||||
- **Concurrency**: the ECOM serializes requests internally. Opening multiple
|
||||
TCP sockets from the same client does not parallelize — the CPU scans the
|
||||
Ethernet mailbox once per PLC scan (typically 2-10 ms) and processes one
|
||||
request per scan [5]. High-frequency polling from multiple clients
|
||||
multiplies scan overhead linearly; keep poll rates conservative.
|
||||
- **Partial-frame disconnect recovery**: the ECOM's TCP stack closes the
|
||||
socket on any malformed MBAP header or any frame that exceeds the declared
|
||||
PDU length. It does not resynchronize mid-stream. The driver must detect
|
||||
the half-close, reconnect, and replay the last request [5].
|
||||
- **Keepalive**: the ECOM does **not** send TCP keepalives. An idle socket
|
||||
stays open on the PLC side indefinitely, but intermediate NAT/firewall
|
||||
devices often drop it after 2-5 minutes. Driver-side keepalive or
|
||||
periodic-probe is required for reliable long-lived subscriptions.
|
||||
- **Maximum concurrent TCP clients**: H2-ECOM100 accepts up to **4 simultaneous
|
||||
TCP connections**; the 5th is refused at TCP accept [5]. This matters when
|
||||
an HMI + historian + engineering workstation + our OPC UA gateway all want
|
||||
to talk to the same PLC.
|
||||
|
||||
Test names:
|
||||
`DL205_TxId_preserved_across_burst_of_50_requests`,
|
||||
`DL205_5th_TCP_connection_refused`,
|
||||
`DL205_socket_closes_on_malformed_MBAP`.
|
||||
|
||||
## References
|
||||
|
||||
1. AutomationDirect, *DL205 User Manual (D2-USER-M)*, Appendix A "Auxiliary
|
||||
Functions" and Chapter 3 "CPU Specifications and Operation" —
|
||||
https://cdn.automationdirect.com/static/manuals/d2userm/d2userm.html
|
||||
2. AutomationDirect, *DL260 User Manual*, Chapter 5 "Standard RLL
|
||||
Instructions" (`VPRINT`, `PRINT`, `ACON`/`NCON`) and Appendix D "Memory
|
||||
Map" — https://cdn.automationdirect.com/static/manuals/d2userm/d2userm.html
|
||||
3. Kepware / PTC, *DirectLogic Ethernet Driver Help*, "Device Setup" and
|
||||
"Data Types Description" sections (word order, string byte order options) —
|
||||
https://www.kepware.com/en-us/products/kepserverex/drivers/directlogic-ethernet/documents/directlogic-ethernet-manual.pdf
|
||||
4. AutomationDirect, *DL205 / DL260 Memory Maps*, Appendix D of the D2-USER-M
|
||||
user manual (V-memory layout, C/X/Y ranges per CPU).
|
||||
5. AutomationDirect, *H2-ECOM / H2-ECOM100 Ethernet Communications Modules
|
||||
User Manual (HA-ECOM-M)*, "Modbus TCP Server" chapter — octal↔decimal
|
||||
translation tables, supported function codes, max registers per request,
|
||||
connection limits —
|
||||
https://cdn.automationdirect.com/static/manuals/hxecomm/hxecomm.html
|
||||
6. Inductive Automation, *Ignition Modbus Driver — Address Mapping*, word
|
||||
order options (ABCD/CDAB/BADC/DCBA) —
|
||||
https://docs.inductiveautomation.com/docs/8.1/ignition-modules/opc-ua/drivers/modbus-v2
|
||||
7. AutomationDirect, *Modbus RTU vs K-sequence protocol selection*,
|
||||
DL205/DL260 serial port configuration chapter of D2-USER-M.
|
||||
8. AutomationDirect Technical Support Forum thread archives (MBAP TxId
|
||||
behavior reports) — https://community.automationdirect.com/ (search:
|
||||
"ECOM100 transaction id"). _Unconfirmed_ operator reports only.
|
||||
149
docs/v2/implementation/phase-6-1-resilience-and-observability.md
Normal file
149
docs/v2/implementation/phase-6-1-resilience-and-observability.md
Normal file
@@ -0,0 +1,149 @@
|
||||
# Phase 6.1 — Resilience & Observability Runtime
|
||||
|
||||
> **Status**: DRAFT — implementation plan for a cross-cutting phase that was never formalised. The v2 `plan.md` specifies Polly, Tier A/B/C protections, structured logging, and local-cache fallback by decision; none are wired end-to-end.
|
||||
>
|
||||
> **Branch**: `v2/phase-6-1-resilience-observability`
|
||||
> **Estimated duration**: 3 weeks
|
||||
> **Predecessor**: Phase 5 (drivers) — partial; S7 + OPC UA Client shipped, AB/TwinCAT/FOCAS paused
|
||||
> **Successor**: Phase 6.2 (Authorization runtime)
|
||||
|
||||
## Phase Objective
|
||||
|
||||
Land the cross-cutting runtime protections + operability features that `plan.md` + `driver-stability.md` specify by decision but that no driver-phase actually wires. End-state: every driver goes through the same Polly resilience layer, health endpoints render the live driver fleet, structured logs carry per-request correlation IDs, and the config substrate survives a central DB outage via a LiteDB local cache.
|
||||
|
||||
Closes these gaps flagged in the 2026-04-19 audit:
|
||||
|
||||
1. Polly v8 resilience pipelines wired to every `IDriver` capability (no-op per-driver today; Galaxy has a hand-rolled `CircuitBreaker` only).
|
||||
2. Tier A/B/C enforcement at runtime — `driver-stability.md` §2–4 and decisions #63–73 define memory watchdog, bounded queues, scheduled recycle, wedge detection; `MemoryWatchdog` exists only inside `Driver.Galaxy.Host`.
|
||||
3. Health endpoints (`/healthz`, `/readyz`) on `OtOpcUa.Server`.
|
||||
4. Structured Serilog with per-request correlation IDs (driver instance, OPC UA session, IPC call).
|
||||
5. LiteDB local cache + Polly retry + fallback on central-DB outage (decision #36).
|
||||
|
||||
## Scope — What Changes
|
||||
|
||||
| Concern | Change |
|
||||
|---------|--------|
|
||||
| `Core` → new `Core.Resilience` sub-namespace | Shared Polly pipeline builder (`DriverResiliencePipelines`). **Pipeline key = `(DriverInstanceId, HostName)`** so one dead PLC behind a multi-device driver doesn't open the breaker for healthy siblings (decision #35 per-device isolation). **Per-capability policy** — Read / HistoryRead / Discover / Probe / Alarm get retries; **Write does NOT** unless `[WriteIdempotent]` on the tag definition (decisions #44-45). |
|
||||
| Every capability-interface consumer in the server | Wrap `IReadable.ReadAsync`, `IWritable.WriteAsync`, `ITagDiscovery.DiscoverAsync`, `ISubscribable.SubscribeAsync/UnsubscribeAsync`, `IHostConnectivityProbe` probe loop, `IAlarmSource.SubscribeAlarmsAsync/AcknowledgeAsync`, `IHistoryProvider.ReadRawAsync/ReadProcessedAsync/ReadAtTimeAsync/ReadEventsAsync`. Composition: timeout → (retry when capability supports) → circuit breaker → bulkhead. |
|
||||
| `Core.Abstractions` → new `WriteIdempotentAttribute` | Marker on `ModbusTagDefinition` / `S7TagDefinition` / `OpcUaClientDriver` tag rows; opts that tag into auto-retry on Write. Absence = no retry, per spec. |
|
||||
| `Core` → new `Core.Stability` sub-namespace — **split** | Two separate subsystems: (a) **`MemoryTracking`** runs all tiers; captures baseline (median of first 5 min `GetMemoryFootprint` samples) + applies the hybrid rule `soft = max(multiplier × baseline, baseline + floor)`; soft breach logs + surfaces to Admin; never kills. (b) **`MemoryRecycle`** (Tier C only — requires out-of-process topology) handles hard-breach recycle via the Proxy-side supervisor. Tier A/B overrun escalates to Tier C promotion ticket, not auto-kill. |
|
||||
| `ScheduledRecycleScheduler` | Tier C only per decisions #73-74. Weekly/time-of-day recycle via Proxy supervisor. Tier A/B opt-in recycle lands in a future phase together with a Tier-C-escalation workflow. |
|
||||
| `WedgeDetector` | **Demand-aware**: flips a driver to Faulted only when `(hasPendingWork AND noProgressIn > threshold)`. `hasPendingWork` derives from non-zero Polly bulkhead depth OR ≥1 active MonitoredItem OR ≥1 queued historian read. Idle + subscription-only drivers stay Healthy. |
|
||||
| `DriverTypeRegistry` | Each driver type registers its `DriverTier` {A, B, C}. Tier C drivers must advertise their out-of-process topology; the registry enforces invariants (Tier C has a `Proxy` + `Host` pair). |
|
||||
| `Driver.Galaxy.Proxy/Supervisor/` | **Retains** existing `CircuitBreaker` + `Backoff` — they guard IPC respawn (decision #68), different concern from the per-call Polly layer. Only `HeartbeatMonitor` is referenced downstream (IPC liveness). |
|
||||
| `OtOpcUa.Server` → Minimal API endpoints on `http://+:4841` | `/healthz` = process alive + (config DB reachable OR `UsingStaleConfig=true`). `/readyz` = ANDed driver health; state-machine per `DriverState`: `Unknown`/`Initializing` → 503, `Healthy` → 200, `Degraded` → 200 + `{degradedDrivers: [...]}` in body, `Faulted` → 503. JSON body always reports per-instance detail. |
|
||||
| Serilog configuration | Centralize enrichers in `OtOpcUa.Server/Observability/LogContextEnricher.cs`. Every capability call runs inside a `LogContext.PushProperty` scope with {DriverInstanceId, DriverType, CapabilityName, CorrelationId (UA RequestHandle or internal GUID)}. Sink config stays rolling-file per CLAUDE.md; JSON sink added alongside plain-text (switchable via `Serilog:WriteJson` appsetting). |
|
||||
| `Configuration` project | Add `LiteDbConfigCache` adapter. **Generation-sealed snapshots**: `sp_PublishGeneration` writes `<cache-root>/<cluster>/<generationId>.db` as a read-only sealed file. Reads serve the last-known-sealed generation; mixed-generation reads are impossible. Write path bypasses cache + fails hard on DB outage. Pipeline: timeout (2 s) → retry (3×, jittered) → fallback-to-sealed-snapshot. |
|
||||
| `DriverHostStatus` vs. `DriverInstanceResilienceStatus` | New separate entity `DriverInstanceResilienceStatus { DriverInstanceId, HostName, LastCircuitBreakerOpenUtc, ConsecutiveFailures, CurrentBulkheadDepth, LastRecycleUtc, BaselineFootprintBytes }`. `DriverHostStatus` keeps per-host connectivity only; Admin `/hosts` joins both for display. |
|
||||
|
||||
## Scope — What Does NOT Change
|
||||
|
||||
| Item | Reason |
|
||||
|------|--------|
|
||||
| Driver wire protocols | Resilience is a server-side wrapper; individual drivers don't see Polly. Their existing retry logic (ModbusTcpTransport reconnect, SessionReconnectHandler) stays in place as inner layers. |
|
||||
| Config DB schema | LiteDB cache is a read-only mirror; no new central tables except `DriverHostStatus` column additions. |
|
||||
| OPC UA wire behavior visible to clients | Health endpoints live on a separate HTTP port (4841 by convention); the OPC UA server on 4840 is unaffected. |
|
||||
| The four 2026-04-13 Galaxy stability findings | Already closed in Phase 2. Phase 6.1 *generalises* the pattern, doesn't re-fix Galaxy. |
|
||||
| Driver-layer SafeHandle usage | Existing Galaxy `SafeMxAccessHandle` + Modbus `TcpClient` disposal stay — they're driver-internal, not part of the cross-cutting layer. |
|
||||
|
||||
## Entry Gate Checklist
|
||||
|
||||
- [ ] Phases 0–5 exit gates cleared (or explicitly deferred with task reference)
|
||||
- [ ] `driver-stability.md` §2–4 re-read; decisions #63–73 + #34–36 re-skimmed
|
||||
- [ ] Polly v8 NuGet available (`Microsoft.Extensions.Resilience` + `Polly.Core`) — verify package restore before task breakdown
|
||||
- [ ] LiteDB 5.x NuGet confirmed MIT + actively maintained
|
||||
- [ ] Existing drivers catalogued: Galaxy.Proxy, Modbus, S7, OpcUaClient — confirm test counts baseline so the resilience layer doesn't regress any
|
||||
- [ ] Serilog configuration inventory: locate every `Log.ForContext` call site that will need `LogContext` rewrap
|
||||
- [ ] Admin `/hosts` page's current `DriverHostStatus` consumption reviewed so the schema extensions don't break it
|
||||
|
||||
## Task Breakdown
|
||||
|
||||
### Stream A — Resilience layer (1 week)
|
||||
|
||||
1. **A.1** Add `Polly.Core` + `Microsoft.Extensions.Resilience` to `Core`. Build `DriverResiliencePipelineBuilder` — key on `(DriverInstanceId, HostName)`; composes Timeout → (Retry when the capability allows it; skipped for Write unless `[WriteIdempotent]`) → CircuitBreaker → Bulkhead. Per-capability policy map documented in `DriverResilienceOptions.CapabilityPolicies`.
|
||||
2. **A.2** `DriverResilienceOptions` record bound from `DriverInstance.ResilienceConfig` JSON column (new nullable). **Per-tier × per-capability** defaults: Tier A (OpcUaClient, S7) Read 3 retries/2 s/5-failure-breaker, Write 0 retries/2 s/5-failure-breaker; Tier B (Modbus) Read 3/4 s/5, Write 0/4 s/5; Tier C (Galaxy) Read 1 retry/10 s/no-kill, Write 0/10 s/no-kill. Idempotent writes can opt into Read-shaped retry via the attribute.
|
||||
3. **A.3** `CapabilityInvoker<TCapability, TResult>` wraps every method on the capability interfaces (`IReadable.ReadAsync`, `IWritable.WriteAsync`, `ITagDiscovery.DiscoverAsync`, `ISubscribable.SubscribeAsync/UnsubscribeAsync`, `IHostConnectivityProbe` probe loop, `IAlarmSource.SubscribeAlarmsAsync/AcknowledgeAsync`, `IHistoryProvider.ReadRawAsync/ReadProcessedAsync/ReadAtTimeAsync/ReadEventsAsync`). Existing server-side dispatch routes through it.
|
||||
4. **A.4** **Retain** `Driver.Galaxy.Proxy/Supervisor/CircuitBreaker.cs` + `Backoff.cs` — they guard IPC process respawn (decision #68), orthogonal to the per-call Polly layer. Only `HeartbeatMonitor` is consumed outside the supervisor.
|
||||
5. **A.5** Unit tests: per-policy, per-composition. Negative integration tests: (a) Modbus FlakeyTransport fails 5× on Read, succeeds 6th — invoker surfaces success; (b) Modbus FlakeyTransport fails 1× on Write with `[WriteIdempotent]=false` — invoker surfaces failure without retry (no duplicate pulse); (c) Modbus FlakeyTransport fails 1× on Write with `[WriteIdempotent]=true` — invoker retries. Bench: no-op overhead < 1%.
|
||||
6. **A.6** `WriteIdempotentAttribute` in `Core.Abstractions`. Modbus/S7/OpcUaClient tag-definition records pick it up; invoker reads via reflection once at driver init.
|
||||
|
||||
### Stream B — Tier A/B/C stability runtime — split into MemoryTracking + MemoryRecycle (1 week)
|
||||
|
||||
1. **B.1** `Core.Abstractions` → `DriverTier` enum {A, B, C}. Extend `DriverTypeRegistry` to require `DriverTier` at registration. Existing driver types stamped (Galaxy = C, Modbus = B, S7 = B, OpcUaClient = A).
|
||||
2. **B.2** **`MemoryTracking`** (all tiers) lifted from `Driver.Galaxy.Host/MemoryWatchdog.cs`. Captures `BaselineFootprintBytes` as the median of first 5 min of `IDriver.GetMemoryFootprint()` samples post-`InitializeAsync`. Applies **decision #70 hybrid formula**: `soft = max(multiplier × baseline, baseline + floor)`; Tier A multiplier=3, floor=50 MB; Tier B multiplier=3, floor=100 MB; Tier C multiplier=2, floor=500 MB. Soft breach → log + `DriverInstanceResilienceStatus.CurrentFootprint` tick; never kills. Hard = 2 × soft.
|
||||
3. **B.3** **`MemoryRecycle`** (Tier C only per decisions #73-74). Hard-breach on a Tier C driver triggers `ScheduledRecycleScheduler.RequestRecycleNow(driverInstanceId)`; scheduler proxies to `Driver.Galaxy.Proxy/Supervisor/` which restarts the Host process. Tier A/B hard-breach logs a promotion-to-Tier-C recommendation; **never auto-kills** the in-process driver.
|
||||
4. **B.4** **`ScheduledRecycleScheduler`** per decision #67: Tier C driver instances opt-in to a weekly recycle at a configured cron. Tier A/B scheduled recycle deferred to a later phase paired with Tier-C escalation.
|
||||
5. **B.5** **`WedgeDetector`** demand-aware: `if (state==Healthy && hasPendingWork && noProgressIn > WedgeThreshold) → force ReinitializeAsync`. `hasPendingWork` = (bulkhead depth > 0) OR (active monitored items > 0) OR (queued historian-read count > 0). `WedgeThreshold` default 5 × PublishingInterval, min 60 s. Idle driver stays Healthy.
|
||||
6. **B.6** Tests: tracking unit tests drive synthetic allocation against a fake `GetMemoryFootprint`; recycle tests use a mock supervisor; wedge tests include the false-fault cases — idle subscriber, slow historian backfill, write-only burst.
|
||||
|
||||
### Stream C — Health endpoints + structured logging (4 days)
|
||||
|
||||
1. **C.1** `OtOpcUa.Server/Observability/HealthEndpoints.cs` — Minimal API on a second Kestrel binding (default `http://+:4841`). `/healthz` reports process uptime + config-DB reachability (or cache-warm). `/readyz` enumerates `DriverInstance` rows + reports each driver's `DriverHealth.State`; returns 503 if ANY driver is Faulted. JSON body per `docs/v2/acl-design.md` §"Operator Dashboards" shape.
|
||||
2. **C.2** `LogContextEnricher` installed at Serilog config time. Every driver-capability call site wraps its body in `using (LogContext.PushProperty("DriverInstanceId", id)) using (LogContext.PushProperty("CorrelationId", correlationId))`. Correlation IDs: reuse OPC UA `RequestHeader.RequestHandle` when in-flight; otherwise generate `Guid.NewGuid().ToString("N")[..12]`.
|
||||
3. **C.3** Add JSON-formatted Serilog sink alongside the existing rolling-file plain-text sink so SIEMs (Splunk, Datadog) can ingest without a regex parser. Sink switchable via `Serilog:WriteJson` appsetting.
|
||||
4. **C.4** Integration test: boot server, issue Modbus read, assert log line contains `DriverInstanceId` + `CorrelationId` structured fields.
|
||||
|
||||
### Stream D — Config DB LiteDB fallback — generation-sealed snapshots (1 week)
|
||||
|
||||
1. **D.1** `LiteDbConfigCache` adapter backed by **sealed generation snapshots**: each successful `sp_PublishGeneration` writes `<cache-root>/<clusterId>/<generationId>.db` as read-only after commit. The adapter maintains a `CurrentSealedGenerationId` pointer updated atomically on successful publish. Mixed-generation reads are **impossible** — every read served from the cache serves one coherent sealed generation.
|
||||
2. **D.2** Write-path queries (draft save, publish) bypass the cache entirely and fail hard on DB outage. Read-path queries (DriverInstance enumeration, LdapGroupRoleMapping, cluster + namespace metadata) go through the pipeline: timeout 2 s → retry 3× jittered → fallback to the current sealed snapshot.
|
||||
3. **D.3** `UsingStaleConfig` flag flips true when a read fell back to the sealed snapshot; cleared on the next successful DB round-trip. Surfaced on `/healthz` body and Admin `/hosts`.
|
||||
4. **D.4** Tests: (a) SQL-container kill mid-operation — read returns sealed snapshot, `UsingStaleConfig=true`, driver stays Healthy; (b) mixed-generation guard — attempt to serve partial generation by corrupting a snapshot file mid-read → adapter fails closed rather than serving mixed data; (c) first-boot-no-snapshot case — adapter refuses to start, driver fails `InitializeAsync` with a clear config-DB-required error.
|
||||
|
||||
### Stream E — Admin `/hosts` page refresh (3 days)
|
||||
|
||||
1. **E.1** Extend `DriverHostStatus` schema with Stream A resilience columns. Generate EF migration.
|
||||
2. **E.2** `Admin/FleetStatusHub` SignalR hub pushes `LastCircuitBreakerOpenUtc` + `CurrentBulkheadDepth` + `LastRecycleUtc` on change.
|
||||
3. **E.3** `/hosts` Blazor page renders new columns; red badge if `ConsecutiveFailures > breakerThreshold / 2`.
|
||||
|
||||
## Compliance Checks (run at exit gate)
|
||||
|
||||
- [ ] **Invoker coverage**: every method on `IReadable` / `IWritable` / `ITagDiscovery` / `ISubscribable` / `IHostConnectivityProbe` / `IAlarmSource` / `IHistoryProvider` in the server dispatch layer routes through `CapabilityInvoker`. Enforce via a Roslyn analyzer (error-level; warning-first is rejected — the compliance check is the gate).
|
||||
- [ ] **Write-retry guard**: writes without `[WriteIdempotent]` never get retried. Unit-test the invoker path asserts zero retry attempts.
|
||||
- [ ] **Pipeline isolation**: pipeline key is `(DriverInstanceId, HostName)`. Integration test with two Modbus hosts under one instance — failing host A does not open the breaker for host B.
|
||||
- [ ] **Tier registry**: every driver type registered in `DriverTypeRegistry` has a non-null `Tier`. Unit test walks the registry + asserts no gaps. Tier C registrations must declare their out-of-process topology.
|
||||
- [ ] **MemoryTracking never kills**: soft/hard breach tests on a Tier A/B driver log + surface without terminating the process.
|
||||
- [ ] **MemoryRecycle Tier C only**: hard breach on a Tier A driver never invokes the supervisor; on Tier C it does.
|
||||
- [ ] **Wedge demand-aware**: test suite includes idle-subscription-only, slow-historian-backfill, and write-only-burst cases — driver stays Healthy.
|
||||
- [ ] **Galaxy supervisor preserved**: `Driver.Galaxy.Proxy/Supervisor/CircuitBreaker.cs` + `Backoff.cs` still present + still invoked on Host crash.
|
||||
- [ ] **Health state machine**: `/healthz` + `/readyz` respond within 500 ms for every `DriverState`; state-machine table in this doc drives the test matrix.
|
||||
- [ ] **Structured log**: CI grep asserts at least one log line per capability call has `"DriverInstanceId"` + `"CorrelationId"` JSON fields.
|
||||
- [ ] **Generation-sealed cache**: integration tests cover (a) SQL-kill mid-operation serves last-sealed snapshot; (b) mixed-generation corruption fails closed; (c) first-boot no-snapshot + DB-down → `InitializeAsync` fails with clear error.
|
||||
- [ ] No regression in existing test suites — `dotnet test ZB.MOM.WW.OtOpcUa.slnx` count equal-or-greater than pre-Phase-6.1 baseline.
|
||||
|
||||
## Risks and Mitigations
|
||||
|
||||
| Risk | Likelihood | Impact | Mitigation |
|
||||
|------|:----------:|:------:|------------|
|
||||
| Polly pipeline adds per-request latency on hot path | Medium | Medium | Benchmark Stream A.5 before merging; 1 % overhead budget; inline hot path short-circuits when retry count = 0 |
|
||||
| LiteDB cache diverges from central DB | Medium | High | Stale-data banner in Admin UI; `UsingStaleConfig` flag surfaced on `/readyz`; cache refresh on every successful DB round-trip; 24-hour synthetic warning |
|
||||
| Tier watchdog false-positive-kills a legitimate batch load | Low | High | Soft/hard threshold split; soft only logs; hard triggers recycle; thresholds configurable per-instance |
|
||||
| Wedge detector races with slow-but-healthy drivers | Medium | High | Minimum 60 s threshold; detector only activates if driver claims `Healthy`; add circuit-breaker feedback so rapid oscillation trips instead of thrashing |
|
||||
| Roslyn analyzer breaks external driver authors | Low | Medium | Release analyzer as warning-level initially; upgrade to error in Phase 6.1+1 after one release cycle |
|
||||
|
||||
## Completion Checklist
|
||||
|
||||
- [ ] Stream A: Polly shared pipeline + per-tier defaults + driver-capability invoker + tests
|
||||
- [ ] Stream B: Tier registry + generalised watchdog + scheduled recycle + wedge detector
|
||||
- [ ] Stream C: `/healthz` + `/readyz` + structured logging + JSON Serilog sink
|
||||
- [ ] Stream D: LiteDB cache + Polly fallback in Configuration
|
||||
- [ ] Stream E: Admin `/hosts` page refresh
|
||||
- [ ] Cross-cutting: `phase-6-1-compliance.ps1` exits 0; full solution `dotnet test` passes; exit-gate doc recorded
|
||||
|
||||
## Adversarial Review — 2026-04-19 (Codex, thread `019da489-e317-7aa1-ab1f-6335e0be2447`)
|
||||
|
||||
Plan substantially rewritten before implementation to address these findings. Each entry: severity · verdict · adjustment.
|
||||
|
||||
1. **Crit · ACCEPT** — Auto-retry collides with decisions #44/#45 (no auto-write-retry; opt-in via `WriteIdempotent` + CAS). Pipeline now **capability-specific**: Read/HistoryRead/Discover/Probe/Alarm-subscribe all get retries; **Write does not** unless the tag metadata carries `WriteIdempotent=true`. New `WriteIdempotentAttribute` surfaces on `ModbusTagDefinition` / `S7TagDefinition` / etc.
|
||||
2. **Crit · ACCEPT** — "One pipeline per driver instance" breaks decision #35's per-device isolation. **Change**: pipeline key is `(DriverInstanceId, HostName)` not just `DriverInstanceId`. One dead PLC behind a multi-device Modbus driver no longer opens the breaker for healthy siblings.
|
||||
3. **Crit · ACCEPT** — Memory watchdog + scheduled recycle at Tier A/B breaches decisions #73/#74 (process-kill protections are Tier-C-only). **Change**: Stream B splits into two — `MemoryTracking` (all tiers, soft/hard thresholds log + surface to Admin `/hosts`; never kills) and `MemoryRecycle` (Tier C only, requires out-of-process topology). Tier A/B overrun paths escalate to Tier C via a future PR, not auto-kill.
|
||||
4. **High · ACCEPT** — Removing Galaxy's hand-rolled `CircuitBreaker` drops decision #68 host-supervision crash-loop protection. **Change**: keep `Driver.Galaxy.Proxy/Supervisor/CircuitBreaker.cs` + `Backoff.cs` — they guard the IPC *process* re-spawn, not the per-call data path. Data-path Polly is an orthogonal layer.
|
||||
5. **High · ACCEPT** — Roslyn analyzer targeting `IDriver` misses the hot paths (`IReadable.ReadAsync`, `IWritable.WriteAsync`, `ISubscribable.SubscribeAsync` etc.). **Change**: analyzer rule now matches every method on the capability interfaces; compliance doc enumerates the full call-site list.
|
||||
6. **High · ACCEPT** — `/healthz` + `/readyz` under-specified for degraded-running. **Change**: add a state-matrix sub-section explicitly covering `Unknown` (pre-init: `/readyz` 503), `Initializing` (503), `Healthy` (200), `Degraded` (200 with JSON body flagging the degraded driver; `/readyz` is OR across drivers), `Faulted` (503), plus cached-config-serving (`/healthz` returns 200 + `UsingStaleConfig: true` in JSON body).
|
||||
7. **High · ACCEPT** — `WedgeDetector` based on "no successful Read" false-fires on write-only subscriptions + idle systems. **Change**: wedge criteria now `(hasPendingWork AND noProgressIn > threshold)` where `hasPendingWork` comes from the Polly bulkhead depth + active MonitoredItem count. Idle driver stays Healthy.
|
||||
8. **High · ACCEPT** — LiteDB cache serving mixed-generation reads breaks publish atomicity. **Change**: cache is snapshot-per-generation. Each published generation writes a sealed snapshot into `<cache-root>/<cluster>/<generationId>.db`; reads serve the last-known-sealed generation and never mix. Central DB outage during a *publish* means that publish fails (write path doesn't use cache); reads continue from the prior sealed snapshot.
|
||||
9. **Med · ACCEPT** — `DriverHostStatus` schema conflates per-host connectivity with per-driver-instance resilience counters. **Change**: new `DriverInstanceResilienceStatus` table separate from `DriverHostStatus`. Admin `/hosts` joins both for display.
|
||||
10. **Med · ACCEPT** — Compliance says analyzer-error; risks say analyzer-warning. **Change**: phase 6.1 ships at **error** level (this phase is the gate); warning-mode option removed.
|
||||
11. **Med · ACCEPT** — Hardcoded per-tier MB bands ignore decision #70's `max(multiplier × baseline, baseline + floor)` formula with observed-baseline capture. **Change**: watchdog captures baseline at post-init plateau (median of first 5 min GetMemoryFootprint samples) + applies the hybrid formula. Tier constants now encode the multiplier + floor, not raw MB.
|
||||
12. **Med · ACCEPT** — Tests mostly cover happy path. **Change**: Stream A.5 adds negative tests for duplicate-write-replay-under-timeout; Stream B.5 adds false-wedge-on-idle-subscription + false-wedge-on-slow-historic-backfill; Stream D.4 adds mixed-generation cache test + corrupt-first-boot cache test.
|
||||
|
||||
147
docs/v2/implementation/phase-6-2-authorization-runtime.md
Normal file
147
docs/v2/implementation/phase-6-2-authorization-runtime.md
Normal file
@@ -0,0 +1,147 @@
|
||||
# Phase 6.2 — Authorization Runtime (ACL + LDAP grants)
|
||||
|
||||
> **Status**: DRAFT — the v2 `plan.md` decision #129 + `acl-design.md` specify a 6-level permission-trie evaluator with `NodePermissions` bitmask grants, but no runtime evaluator exists. ACL tables are schematized but unread by the data path.
|
||||
>
|
||||
> **Branch**: `v2/phase-6-2-authorization-runtime`
|
||||
> **Estimated duration**: 2.5 weeks
|
||||
> **Predecessor**: Phase 6.1 (Resilience & Observability) — reuses the Polly pipeline for ACL-cache refresh retries
|
||||
> **Successor**: Phase 6.3 (Redundancy)
|
||||
|
||||
## Phase Objective
|
||||
|
||||
Wire ACL enforcement on every OPC UA Read / Write / Subscribe / Call path + LDAP group → admin role grants that the v2 plan specified but never ran. End-state: a user's effective permissions resolve through a per-session permission-trie over the 6-level `Cluster / Namespace / UnsArea / UnsLine / Equipment / Tag` hierarchy, cached per session, invalidated on generation-apply + LDAP group expiry.
|
||||
|
||||
Closes these gaps:
|
||||
|
||||
1. **Data-path ACL enforcement** — `NodeAcl` table + `NodePermissions` flags shipped; `NodeAclService.cs` present as a CRUD surface; no code consults ACLs at `Read`/`Write` time. OPC UA server answers everything to everyone.
|
||||
2. **`LdapGroupRoleMapping` for cluster-scoped admin grants** — decision #105 shipped as the *design*; admin roles are hardcoded (`FleetAdmin` / `ConfigEditor` / `ReadOnly`) with no cluster-scoping and no LDAP-to-grant table. Decision #105 explicitly lifts this from v2.1 into v2.0.
|
||||
3. **Explicit Deny pathway** — deferred to v2.1 (decision #129 note). Phase 6.2 ships *grants only*; `Deny` stays out.
|
||||
4. **Admin UI ACL grant editor** — `AclsTab.razor` exists but edits the now-unused `NodeAcl` table; needs to wire to the runtime evaluator + the new `LdapGroupRoleMapping` table.
|
||||
|
||||
## Scope — What Changes
|
||||
|
||||
**Architectural separation** (critical for correctness): `LdapGroupRoleMapping` is **control-plane only** — it maps LDAP groups to Admin UI roles (`FleetAdmin` / `ConfigEditor` / `ReadOnly`) and cluster scopes for Admin access. **It is NOT consulted by the OPC UA data-path evaluator.** The data-path evaluator reads `NodeAcl` rows joined directly against the session's **resolved LDAP group memberships**. The two concerns share zero runtime code path.
|
||||
|
||||
| Concern | Change |
|
||||
|---------|--------|
|
||||
| `Configuration` project | New entity `LdapGroupRoleMapping { Id, LdapGroup, Role, ClusterId? (nullable = system-wide), IsSystemWide, GeneratedAtUtc }`. **Consumed only by Admin UI role routing.** Migration. Admin CRUD. |
|
||||
| `Core` → new `Core.Authorization` sub-namespace | `IPermissionEvaluator.Authorize(IEnumerable<Claim> identity, OpcUaOperation op, NodeId nodeId) → AuthorizationDecision`. `op` covers every OPC UA surface: Browse, Read, Write, HistoryRead, HistoryUpdate, CreateMonitoredItems, TransferSubscriptions, Call, Acknowledge, Confirm, Shelve. Result is tri-state (internal model distinguishes `Allow` / `NotGranted` / `Denied` + carries matched-grant provenance). Phase 6.2 only produces `Allow` + `NotGranted`; v2.1 Deny lands without API break. |
|
||||
| `PermissionTrieBuilder` | Builds trie from `NodeAcl` rows joined against **resolved LDAP group memberships**, keyed on 6-level scope hierarchy for Equipment namespaces. **SystemPlatform namespaces (Galaxy)** use a `FolderSegment` scope level between Namespace and Tag, populated from `Tag.FolderPath` segments, so folder subtree authorization works on Galaxy trees the same way UNS works on Equipment trees. Trie node carries `ScopeKind` enum. |
|
||||
| `PermissionTrieCache` + freshness | One trie per `(ClusterId, GenerationId)`. Invalidated on `sp_PublishGeneration` via in-process event bus AND generation-ID check on hot path — every authz call looks up `CurrentGenerationId` (Polly-wrapped, sub-second cache); a Backup that cached a stale generation detects the mismatch + forces re-load. **Redundancy-safe**. |
|
||||
| `UserAuthorizationState` freshness | Cached per session BUT bounded by `MembershipFreshnessInterval` (default **15 min**). Past that, the next hot-path authz call re-resolves LDAP group memberships via `LdapGroupService`. Failure to re-resolve (LDAP unreachable) → **fail-closed**: evaluator returns `NotGranted` for every call until memberships refresh successfully. Decoupled from Phase 6.1's availability-oriented 24h cache. |
|
||||
| `AuthCacheMaxStaleness` | Separate from Phase 6.1's `UsingStaleConfig` window. Default 5 min — beyond that, authz fails closed regardless of Phase 6.1 cache warmth. |
|
||||
| OPC UA server dispatch — all enforcement surfaces | `DriverNodeManager` wires evaluator on: **Browse + TranslateBrowsePathsToNodeIds** (ancestors implicitly visible if any descendant has a grant; denied ancestors filter from results), **Read** (per-attribute StatusCode `BadUserAccessDenied` in mixed-authorization batches; batch never poisons), **Write** (uses `NodePermissions.WriteOperate/Tune/Configure` based on driver `SecurityClassification`), **HistoryRead** (uses `NodePermissions.HistoryRead` — **distinct** flag, not Read), **HistoryUpdate** (`NodePermissions.HistoryUpdate`), **CreateMonitoredItems** (per-`MonitoredItemCreateResult` denial), **TransferSubscriptions** (re-evaluates items on transfer), **Call** (`NodePermissions.MethodCall`), **Acknowledge/Confirm/Shelve** (per-alarm flags). |
|
||||
| Subscription re-authorization | Each `MonitoredItem` is stamped with `(AuthGenerationId, MembershipVersion)` at create time. On every Publish, items with a stamp mismatching the session's current `(AuthGenerationId, MembershipVersion)` get re-evaluated; revoked items drop to `BadUserAccessDenied` within one publish cycle. Unchanged items stay fast-path. |
|
||||
| `LdapAuthService` | On cookie-auth success: resolves LDAP group memberships; loads matching `LdapGroupRoleMapping` rows → role claims + cluster-scope claims (control plane); stores `UserAuthorizationState.LdapGroups` on the session for the data-plane evaluator. |
|
||||
| `ValidatedNodeAclAuthoringService` | Replaces CRUD-only `NodeAclService` for authoring. Validates (LDAP group exists, scope exists in current or target draft, grant shape is valid, no duplicate `(LdapGroup, Scope)` pair). Admin UI writes only through it. |
|
||||
| Admin UI `AclsTab.razor` | Writes via `ValidatedNodeAclAuthoringService`. Adds Probe-This-Permission row that runs the real evaluator against a chosen `(LDAP group, node, operation)` and shows `Allow` / `NotGranted` + matched-grant provenance. |
|
||||
| Admin UI new tab `RoleGrantsTab.razor` | CRUD over `LdapGroupRoleMapping`. Per-cluster + system-wide grants. FleetAdmin only. **Documentation explicit** that this only affects Admin UI access, not OPC UA data plane. |
|
||||
| Audit log | Every Grant/Revoke/Publish on `LdapGroupRoleMapping` or `NodeAcl` writes an `AuditLog` row with old/new state + user. |
|
||||
|
||||
## Scope — What Does NOT Change
|
||||
|
||||
| Item | Reason |
|
||||
|------|--------|
|
||||
| OPC UA authn | Already done (PR 19 LDAP user identity + Basic256Sha256 profile). Phase 6.2 is authorization only. |
|
||||
| Explicit `Deny` grants | Decision #129 note explicitly defers to v2.1. Default-deny + additive grants only. |
|
||||
| Driver-side `SecurityClassification` metadata | Drivers keep reporting `Operate` / `ViewOnly` / etc. — the evaluator uses them as *part* of the decision but doesn't replace them. |
|
||||
| Galaxy namespace (SystemPlatform kind) | UNS levels don't apply; evaluator treats Galaxy nodes as `Cluster → Namespace → Tag` (skip UnsArea/UnsLine/Equipment). |
|
||||
|
||||
## Entry Gate Checklist
|
||||
|
||||
- [ ] Phase 6.1 merged (reuse `Core.Resilience` Polly pipeline for the ACL cache-refresh retries)
|
||||
- [ ] `acl-design.md` re-read in full
|
||||
- [ ] Decision log #105, #129, corrections-doc B1 re-skimmed
|
||||
- [ ] Existing `NodeAcl` + `NodePermissions` flag enum audited; confirm bitmask flags match `acl-design.md` table
|
||||
- [ ] Existing `LdapAuthService` group-resolution code path traced end-to-end — confirm it already queries group memberships (we only need the caller to consume the result)
|
||||
- [ ] Test DB scenarios catalogued: two clusters, three LDAP groups per cluster, mixed grant shapes; captured as seed-data fixtures
|
||||
|
||||
## Task Breakdown
|
||||
|
||||
### Stream A — `LdapGroupRoleMapping` table + migration (3 days)
|
||||
|
||||
1. **A.1** Entity + EF Core migration. Columns per §Scope table. Unique constraint on `(LdapGroup, ClusterId)` with null-tolerant comparer for the system-wide case. Index on `LdapGroup` for the hot-path lookup on auth.
|
||||
2. **A.2** `ILdapGroupRoleMappingService` CRUD. Wrap in the Phase 6.1 Polly pipeline (timeout → retry → fallback-to-cache).
|
||||
3. **A.3** Seed-data migration: preserve the current hardcoded `FleetAdmin` / `ConfigEditor` / `ReadOnly` mappings by seeding rows for the existing LDAP groups the dev box uses (`cn=fleet-admin,…`, `cn=config-editor,…`, `cn=read-only,…`). Op no-op migration for existing deployments.
|
||||
|
||||
### Stream B — Permission-trie evaluator (1 week)
|
||||
|
||||
1. **B.1** `IPermissionEvaluator.Authorize(IEnumerable<Claim> identity, NodeId nodeId, NodePermissions needed)` — returns `bool`. Phase 6.2 returns only `true` / `false`; v2.1 can widen to `Allow`/`Deny`/`Indeterminate` if Deny lands.
|
||||
2. **B.2** `PermissionTrieBuilder` builds the trie from `NodeAcl` + `LdapGroupRoleMapping` joined to the current generation's `UnsArea` + `UnsLine` + `Equipment` + `Tag` tables. One trie per `(ClusterId, GenerationId)` so rollback doesn't smear permissions across generations.
|
||||
3. **B.3** Trie node structure: `{ Level: enum, ScopeId: Guid, AllowedPermissions: NodePermissions, ChildrenByLevel: Dictionary<Guid, TrieNode> }`. Evaluation walks from Cluster → Namespace → UnsArea → UnsLine → Equipment → Tag, ORing allowed permissions at each level. Additive semantics: a grant at Cluster level cascades to every descendant tag.
|
||||
4. **B.4** `PermissionTrieCache` service scoped as singleton; exposes `GetTrieAsync(ClusterId, ct)` that returns the current-generation trie. Invalidated on `sp_PublishGeneration` via an in-process event bus; also on TTL expiry (24 h safety net).
|
||||
5. **B.5** Per-session cached evaluator: OPC UA Session authentication produces `UserAuthorizationState { ClusterId, LdapGroups[], Trie }`; cached on the session until session close or generation-apply.
|
||||
6. **B.6** Unit tests: trie-walk theory covering (a) Cluster-level grant cascades to tags, (b) Equipment-level grant doesn't leak to sibling Equipment, (c) multi-group union, (d) no-grant → deny, (e) Galaxy nodes skip UnsArea/UnsLine levels.
|
||||
|
||||
### Stream C — OPC UA server dispatch wiring (6 days, widened)
|
||||
|
||||
1. **C.1** `DriverNodeManager.Read` — evaluator consulted per `ReadValueId` with `OpcUaOperation.Read`. Denied attributes get `BadUserAccessDenied` per-item; batch never poisons. Integration test covers mixed-authorization batch (3 authorized + 2 denied → 3 Good values + 2 Bad StatusCodes, request completes).
|
||||
2. **C.2** `DriverNodeManager.Write` — evaluator chooses `NodePermissions.WriteOperate` / `WriteTune` / `WriteConfigure` based on the driver-reported `SecurityClassification`.
|
||||
3. **C.3** `DriverNodeManager.HistoryRead` — **uses `NodePermissions.HistoryRead`**, which is a **distinct flag** from Read. Test: user with Read but not HistoryRead can read live values but gets `BadUserAccessDenied` on `HistoryRead`.
|
||||
4. **C.4** `DriverNodeManager.HistoryUpdate` — uses `NodePermissions.HistoryUpdate`.
|
||||
5. **C.5** `DriverNodeManager.CreateMonitoredItems` — per-`MonitoredItemCreateResult` denial in mixed-authorization batch; partial success path per OPC UA Part 4. Each created item stamped `(AuthGenerationId, MembershipVersion)`.
|
||||
6. **C.6** `DriverNodeManager.TransferSubscriptions` — on reconnect, re-evaluate every transferred `MonitoredItem` against the session's current auth state. Stale-stamp items drop to `BadUserAccessDenied`.
|
||||
7. **C.7** **Browse + TranslateBrowsePathsToNodeIds** — evaluator called with `OpcUaOperation.Browse`. Ancestor visibility implied when any descendant has a grant (per `acl-design.md` §Browse). Denied ancestors filter from browse results — the UA browser sees a hierarchy truncated at the denied ancestor rather than an inconsistent child-without-parent view.
|
||||
8. **C.8** `DriverNodeManager.Call` — `NodePermissions.MethodCall`.
|
||||
9. **C.9** Alarm actions (Acknowledge / Confirm / Shelve) — per-alarm `NodePermissions.AlarmAck` / `AlarmConfirm` / `AlarmShelve`.
|
||||
10. **C.10** Publish path — for each `MonitoredItem` with a mismatched `(AuthGenerationId, MembershipVersion)` stamp, re-evaluate. Unchanged items stay fast-path; changes happen at next publish cycle.
|
||||
11. **C.11** Integration tests: three-user seed with different memberships; matrix covers every operation in §Scope. Mixed-batch tests for Read + CreateMonitoredItems.
|
||||
|
||||
### Stream D — Admin UI refresh (4 days)
|
||||
|
||||
1. **D.1** `RoleGrantsTab.razor` — FleetAdmin-gated CRUD on `LdapGroupRoleMapping`. Per-cluster dropdown + system-wide checkbox. Validation: LDAP group must exist in the dev LDAP (GLAuth) before saving — best-effort probe with graceful degradation.
|
||||
2. **D.2** `AclsTab.razor` rewrites its edit path to write through the new `NodeAclService`. Adds a "Probe this permission" row: choose `(LDAP group, node, action)` → shows Allow / Deny + the reason (which grant matched).
|
||||
3. **D.3** Draft-generation diff viewer now includes an ACL section: "X grants added, Y grants removed, Z grants changed."
|
||||
4. **D.4** SignalR notification: `PermissionTrieCache` invalidation on `sp_PublishGeneration` pushes to Admin UI so operators see "this clusters permissions were just updated" within 2 s.
|
||||
|
||||
## Compliance Checks (run at exit gate)
|
||||
|
||||
- [ ] **Control/data-plane separation**: `LdapGroupRoleMapping` consumed only by Admin UI; the data-path evaluator has zero references to it. Enforced via a project-reference audit (Admin project references the mapping service; `Core.Authorization` does not).
|
||||
- [ ] **Every operation wired**: Browse, Read, Write, HistoryRead, HistoryUpdate, CreateMonitoredItems, TransferSubscriptions, Call, Acknowledge, Confirm, Shelve all consult the evaluator. Integration test matrix covers every operation × allow/deny.
|
||||
- [ ] **HistoryRead uses its own flag**: test "user with Read + no HistoryRead gets `BadUserAccessDenied` on HistoryRead".
|
||||
- [ ] **Mixed-batch semantics**: Read of 5 nodes (3 allowed + 2 denied) returns 3 Good + 2 `BadUserAccessDenied` per-`ReadValueId`; CreateMonitoredItems equivalent.
|
||||
- [ ] **Browse ancestor visibility**: user with a grant only on a deep equipment node can browse the path to it (ancestors implied); denied ancestors filter from browse results otherwise.
|
||||
- [ ] **Galaxy FolderSegment coverage**: a grant on a Galaxy folder subtree cascades to its tags; sibling folders are unaffected. Trie test covers this.
|
||||
- [ ] **Subscription re-authorization**: integration test — create item, revoke grant via draft+publish, next publish cycle the item returns `BadUserAccessDenied` (not silently still-notifying).
|
||||
- [ ] **Membership freshness**: test — 15 min MembershipFreshnessInterval elapses on a long-lived session + LDAP now unreachable → authz fails closed on the next request until LDAP recovers.
|
||||
- [ ] **Auth cache fail-closed**: test — Phase 6.1 cache serves stale config for 6 min; authz evaluator refuses all calls after 5 min regardless.
|
||||
- [ ] **Trie invariants**: `PermissionTrieBuilder` is idempotent (build twice with identical inputs → equal tries).
|
||||
- [ ] **Additive grants + cluster isolation**: cluster-grant cascades; cross-cluster leakage impossible.
|
||||
- [ ] **Redundancy-safe invalidation**: integration test — two nodes, a publish on one, authorize a request on the other before in-process event propagates → generation-mismatch forces re-load, no stale decision.
|
||||
- [ ] **Authoring validation**: `AclsTab` cannot save a `(LdapGroup, Scope)` pair that already exists in the draft; operator sees the validation error pre-save.
|
||||
- [ ] **AuthorizationDecision shape stability**: API surface exposes `Allow` + `NotGranted` only; `Denied` variant exists in the type but is never produced; v2.1 can add Deny without API break.
|
||||
- [ ] No regression in driver test counts.
|
||||
|
||||
## Risks and Mitigations
|
||||
|
||||
| Risk | Likelihood | Impact | Mitigation |
|
||||
|------|:----------:|:------:|------------|
|
||||
| ACL evaluator latency on per-read hot path | Medium | High | Trie lookup is O(depth) = O(6); session-cached UserAuthorizationState avoids per-Read trie rebuild; benchmark in Stream B.6 |
|
||||
| Trie cache stale after a rollback | Medium | High | `sp_PublishGeneration` + `sp_RollbackGeneration` both emit the invalidation event; trie keyed on `(ClusterId, GenerationId)` so rollback fetches the prior trie cleanly |
|
||||
| `BadUserAccessDenied` returns expose sensitive browse-name metadata | Low | Medium | Server returns only the status code + NodeId; no message leak per OPC UA Part 4 §7.34 guidance |
|
||||
| LdapGroupRoleMapping migration breaks existing deployments | Low | High | Seed-migration preserves the hardcoded groups' effective grants verbatim; smoke test exercises the post-migration fleet admin login |
|
||||
| Deny semantics accidentally ship (would break `acl-design.md` defer) | Low | Medium | `IPermissionEvaluator.Authorize` returns `bool` (not tri-state) through Phase 6.2; widening to `Allow`/`Deny`/`Indeterminate` is a v2.1 ticket |
|
||||
|
||||
## Completion Checklist
|
||||
|
||||
- [ ] Stream A: `LdapGroupRoleMapping` entity + migration + CRUD + seed
|
||||
- [ ] Stream B: evaluator + trie builder + cache + per-session state + unit tests
|
||||
- [ ] Stream C: OPC UA dispatch wiring on Read/Write/HistoryRead/Subscribe/Alarm paths
|
||||
- [ ] Stream D: Admin UI `RoleGrantsTab` + `AclsTab` refresh + SignalR invalidation
|
||||
- [ ] `phase-6-2-compliance.ps1` exits 0; exit-gate doc recorded
|
||||
|
||||
## Adversarial Review — 2026-04-19 (Codex, thread `019da48d-0d2b-7171-aed2-fc05f1f39ca3`)
|
||||
|
||||
1. **Crit · ACCEPT** — Trie must not conflate `LdapGroupRoleMapping` (control-plane admin claims per decision #105) with data-plane ACLs (decision #129). **Change**: `LdapGroupRoleMapping` is consumed only by the Admin UI role router. Data-plane trie reads `NodeAcl` rows joined against the session's **resolved LDAP groups**, never admin roles. Stream B.2 updated.
|
||||
2. **Crit · ACCEPT** — Cached `UserAuthorizationState` survives LDAP group changes because memberships only refresh at cookie-auth. Change: add `MembershipFreshnessInterval` (default 15 min); past that, next hot-path authz call forces group re-resolution (fail-closed if LDAP unreachable). Session-close-wins on config-rollback.
|
||||
3. **High · ACCEPT** — Node-local invalidation doesn't extend across redundant pair. **Change**: trie keyed on `(ClusterId, GenerationId)`; hot-path authz looks up `CurrentGenerationId` from the shared config DB (Polly-wrapped + sub-second cache). A Backup that read stale generation gets a mismatched trie → forces re-load. Implementation note added to Stream B.4.
|
||||
4. **High · ACCEPT** — Browse enforcement missing. **Change**: new Stream C.7 (`Browse + TranslateBrowsePathsToNodeIds` enforcement). Ancestor visibility implied when any descendant has a grant; denied ancestors filter from browse results per `acl-design.md` §Browse.
|
||||
5. **High · ACCEPT** — `HistoryRead` should use `NodePermissions.HistoryRead` bit, not `Read`. **Change**: Stream C.3 revised; separate unit test asserts `Read+no-HistoryRead` denies HistoryRead while allowing current-value reads.
|
||||
6. **High · ACCEPT** — Galaxy shallow-path (Cluster→Namespace→Tag) loses folder hierarchy authorization. **Change**: SystemPlatform namespaces use a `FolderSegment` scope-level between Namespace and Tag, populated from `Tag.FolderPath`; UNS-kind namespaces keep the 6-level hierarchy. Trie supports both via `ScopeKind` on each node.
|
||||
7. **High · ACCEPT** — Subscription re-authorization policy unresolved between create-time-only (fast, wrong on revoke) and per-publish (slow). **Change**: stamp each `MonitoredItem` with `(AuthGenerationId, MembershipVersion)`; re-evaluate on Publish only when either version changed. Revoked items drop to `BadUserAccessDenied` within one publish cycle.
|
||||
8. **Med · ACCEPT** — Mixed-authorization batch `Read` / `CreateMonitoredItems` service-result semantics underspecified. **Change**: Stream C.6 explicitly tests per-`ReadValueId` + per-`MonitoredItemCreateResult` denial in mixed batches; batch never collapses to a coarse failure.
|
||||
9. **Med · ACCEPT** — Missing surfaces: `Method.Call`, `HistoryUpdate`, event filter on subscriptions, subscription-transfer on reconnect, alarm-ack. **Change**: scope expanded — every OPC UA authorization surface enumerated in Stream C: Read, Write, HistoryRead, HistoryUpdate, CreateMonitoredItems, TransferSubscriptions, Call, Acknowledge/Confirm/Shelve, Browse, TranslateBrowsePathsToNodeIds.
|
||||
10. **Med · ACCEPT** — `bool` evaluator bakes in grant-only semantics; collides with v2.1 Deny. **Change**: internal model uses `AuthorizationDecision { Allow | NotGranted | Denied, IReadOnlyList<MatchedGrant> Provenance }`. Phase 6.2 maps `Denied` → never produced; UI + audit log use the full record so v2.1 Deny lands without API break.
|
||||
11. **Med · ACCEPT** — 6.1 cache fallback is availability-oriented; applying it to auth is correctness-dangerous. **Change**: auth-specific staleness budget `AuthCacheMaxStaleness` (default 5 min, not 24 h). Past that, hot-path evaluator fails closed on cached reads; all authorization calls return `NotGranted` until fresh data lands. Documented in risks + compliance.
|
||||
12. **Low · ACCEPT** — Existing `NodeAclService` is raw CRUD. **Change**: new `ValidatedNodeAclAuthoringService` enforces scope-uniqueness + draft/publish invariants + rejects invalid (LDAP group, scope) pairs; Admin UI writes through it only. Stream D.2 adjusted.
|
||||
|
||||
150
docs/v2/implementation/phase-6-3-redundancy-runtime.md
Normal file
150
docs/v2/implementation/phase-6-3-redundancy-runtime.md
Normal file
@@ -0,0 +1,150 @@
|
||||
# Phase 6.3 — Redundancy Runtime
|
||||
|
||||
> **Status**: DRAFT — `CLAUDE.md` + `docs/Redundancy.md` describe a non-transparent warm/hot redundancy model with unique ApplicationUris, `RedundancySupport` advertisement, `ServerUriArray`, and dynamic `ServiceLevel`. Entities (`ServerCluster`, `ClusterNode`, `RedundancyRole`, `RedundancyMode`) exist; the runtime behavior (actual `ServiceLevel` number computation, mid-apply dip, `ServerUriArray` broadcast) is not wired.
|
||||
>
|
||||
> **Branch**: `v2/phase-6-3-redundancy-runtime`
|
||||
> **Estimated duration**: 2 weeks
|
||||
> **Predecessor**: Phase 6.2 (Authorization) — reuses the Phase 6.1 health endpoints for cluster-peer probing
|
||||
> **Successor**: Phase 6.4 (Admin UI completion)
|
||||
|
||||
## Phase Objective
|
||||
|
||||
Land the non-transparent redundancy protocol end-to-end: two `OtOpcUa.Server` instances in a `ServerCluster` each expose a live `ServiceLevel` node whose value reflects that instance's suitability to serve traffic, advertise each other via `ServerUriArray`, and transition role (Primary ↔ Backup) based on health + operator intent.
|
||||
|
||||
Closes these gaps:
|
||||
|
||||
1. **Dynamic `ServiceLevel`** — OPC UA Part 5 §6.3.34 specifies a Byte (0..255) that clients poll to pick the healthiest server. Our server publishes it as a static value today.
|
||||
2. **`ServerUriArray` broadcast** — Part 4 specifies that every node in a redundant pair should advertise its peers' ApplicationUris. Currently advertises only its own.
|
||||
3. **Primary / Backup role coordination** — entities carry `RedundancyRole` but the runtime doesn't read it; no peer health probing; no role-transfer on primary failure.
|
||||
4. **Mid-apply dip** — decision-level expectation that a server mid-generation-apply should report a *lower* ServiceLevel so clients cut over to the peer during the apply window. Not implemented.
|
||||
|
||||
## Scope — What Changes
|
||||
|
||||
| Concern | Change |
|
||||
|---------|--------|
|
||||
| `OtOpcUa.Server` → new `Server.Redundancy` sub-namespace | `RedundancyCoordinator` singleton. Resolves the current node's `ClusterNode` row at startup, loads peers, runs **two-layer peer health probe**: (a) `/healthz` every 2 s as the fast-fail (inherits Phase 6.1 semantics — HTTP + DB/cache healthy); (b) `UaHealthProbe` every 10 s — opens a lightweight OPC UA client session to the peer + reads its `ServiceLevel` node + verifies endpoint serves data. Authority decisions use UaHealthProbe; `/healthz` is used only to avoid wasting UA probes when peer is obviously down. |
|
||||
| Publish-generation fencing | Topology + role decisions are stamped with a monotonic `ConfigGenerationId` from the shared config DB. Coordinator re-reads topology via CAS on `(ClusterId, ExpectedGeneration)` → new row; peers reject state propagated from a lower generation. Prevents split-publish races. |
|
||||
| `InvalidTopology` runtime state | If both nodes detect >1 Primary AFTER startup (config-DB drift during a publish), both self-demote to ServiceLevel 2 until convergence. Neither node serves authoritatively; clients pick the healthier alternative or reconnect later. |
|
||||
| OPC UA server root | `ServiceLevel` variable node becomes a `BaseDataVariable` whose value updates on `RedundancyCoordinator` state change. `ServerUriArray` array variable includes **self + peers** in stable deterministic ordering (decision per OPC UA Part 4 §6.6.2.2). `RedundancySupport` stays static (set from `RedundancyMode` at startup); `Transparent` mode validated pre-publish, not rejected at startup. |
|
||||
| `RedundancyCoordinator` computation | **8-state ServiceLevel matrix** — avoids OPC UA Part 5 §6.3.34 collision (`0=Maintenance`, `1=NoData`). Operator-declared maintenance only = **0**. Unreachable / Faulted = **1**. In-range operational states occupy **2..255**: Authoritative-Primary = **255**; Isolated-Primary (peer unreachable, self serving) = **230**; Primary-Mid-Apply = **200**; Recovering-Primary (post-fault, dwell not met) = **180**; Authoritative-Backup = **100**; Isolated-Backup (primary unreachable, "take over if asked") = **80**; Backup-Mid-Apply = **50**; Recovering-Backup = **30**; `InvalidTopology` (runtime detects >1 Primary) = **2** (detected-inconsistency band — below normal operation). Full matrix documented in `docs/Redundancy.md` update. |
|
||||
| Role transition | Split-brain avoidance: role is *declared* in the shared config DB (`ClusterNode.RedundancyRole`), not elected at runtime. An operator flips the row (or a failover script does). Coordinator only reads; never writes. |
|
||||
| `sp_PublishGeneration` hook | Uses named **apply leases** keyed to `(ConfigGenerationId, PublishRequestId)`. `await using var lease = coordinator.BeginApplyLease(...)`. Disposal on any exit path (success, exception, cancellation) decrements. Watchdog auto-closes any lease older than `ApplyMaxDuration` (default 10 min) → ServiceLevel can't stick at mid-apply. Pre-publish validator rejects unsupported `RedundancyMode` (e.g. `Transparent`) with a clear error so runtime never sees an invalid state. |
|
||||
| Admin UI `/cluster/{id}` page | New `RedundancyTab.razor` — shows current node's role + ServiceLevel + peer reachability. FleetAdmin can trigger a role-swap by editing `ClusterNode.RedundancyRole` + publishing a draft. |
|
||||
| Metrics | New OpenTelemetry metrics: `ot_opcua_service_level{cluster,node}`, `ot_opcua_peer_reachable{cluster,node,peer}`, `ot_opcua_apply_in_progress{cluster,node}`. Sink via Phase 6.1 observability layer. |
|
||||
|
||||
## Scope — What Does NOT Change
|
||||
|
||||
| Item | Reason |
|
||||
|------|--------|
|
||||
| OPC UA authn / authz | Phases 6.2 + prior. Redundancy is orthogonal. |
|
||||
| Driver layer | Drivers aren't redundancy-aware; they run on each node independently against the same equipment. The server layer handles the ServiceLevel story. |
|
||||
| Automatic failover / election | Explicitly out of scope. Non-transparent = client picks which server to use via ServiceLevel + ServerUriArray. We do NOT ship consensus, leader election, or automatic promotion. Operator-driven failover is the v2.0 model per decision #79–85. |
|
||||
| Transparent redundancy (`RedundancySupport=Transparent`) | Not supported. If the operator asks for it the server fails startup with a clear error. |
|
||||
| Historian redundancy | Galaxy Historian's own redundancy (two historians on two CPUs) is out of scope. The Galaxy driver talks to whichever historian is reachable from its node. |
|
||||
|
||||
## Entry Gate Checklist
|
||||
|
||||
- [ ] Phase 6.1 merged (uses `/healthz` for peer probing)
|
||||
- [ ] `CLAUDE.md` §Redundancy + `docs/Redundancy.md` re-read
|
||||
- [ ] Decisions #79–85 re-skimmed
|
||||
- [ ] `ServerCluster`/`ClusterNode`/`RedundancyRole`/`RedundancyMode` entities + existing migration reviewed
|
||||
- [ ] OPC UA Part 4 §Redundancy + Part 5 §6.3.34 (ServiceLevel) re-skimmed
|
||||
- [ ] Dev box has two OtOpcUa.Server instances configured against the same cluster — one designated Primary, one Backup — for integration testing
|
||||
|
||||
## Task Breakdown
|
||||
|
||||
### Stream A — Cluster topology loader (3 days)
|
||||
|
||||
1. **A.1** `RedundancyCoordinator` startup path: reads `ClusterNode` row for the current node (identified by `appsettings.json` `Cluster:NodeId`), reads the cluster's peer list, validates invariants (no duplicate `ApplicationUri`, at most one `Primary` per cluster if `RedundancyMode.WarmActive`, at most two nodes total in v2.0 per decision #83).
|
||||
2. **A.2** Topology subscription — coordinator re-reads on `sp_PublishGeneration` confirmation so an operator role-swap takes effect after publish (no process restart needed).
|
||||
3. **A.3** Tests: two-node cluster seed, one-node cluster seed (degenerate), duplicate-uri rejection.
|
||||
|
||||
### Stream B — Peer health probing + ServiceLevel computation (6 days, widened)
|
||||
|
||||
1. **B.1** `PeerHttpProbeLoop` per peer at 2 s — calls peer's `/healthz`, 1 s timeout, exponential backoff on sustained failure. Used as fast-fail.
|
||||
2. **B.2** `PeerUaProbeLoop` per peer at 10 s — opens an OPC UA client session to the peer (reuses Phase 5 `Driver.OpcUaClient` stack), reads peer's `ServiceLevel` node + verifies endpoint serves data. Short-circuit: if HTTP probe is failing, skip UA probe (no wasted sessions).
|
||||
3. **B.3** `ServiceLevelCalculator.Compute(role, selfHealth, peerHttpHealthy, peerUaHealthy, applyInProgress, recoveryDwellMet, topologyValid) → byte`. 8-state matrix per §Scope. `topologyValid=false` forces InvalidTopology = 2 regardless of other inputs.
|
||||
4. **B.4** `RecoveryStateManager`: after a `Faulted → Healthy` transition, hold driver in `Recovering` band (180 Primary / 30 Backup) for `RecoveryDwellTime` (default 60 s) AND require one positive publish witness (successful `Read` on a reference node) before entering Authoritative band.
|
||||
5. **B.5** Calculator reacts to inputs via `IObserver` so changes immediately push to the OPC UA `ServiceLevel` node.
|
||||
6. **B.6** Tests: **64-case matrix** covering role × self-health × peer-http × peer-ua × apply × recovery × topology. Specific cases flagged: Primary-with-unreachable-peer-serves-at-230 (authority retained); Backup-with-unreachable-primary-escalates-to-80 (not auto-promote); InvalidTopology demotes both nodes; Recovering dwell + publish-witness blocks premature return to 255.
|
||||
|
||||
### Stream C — OPC UA node wiring (3 days)
|
||||
|
||||
1. **C.1** `ServiceLevel` variable node created under `ServerStatus` at server startup. Type `Byte`, AccessLevel = CurrentRead only. Subscribe to `ServiceLevelCalculator` observable; push updates via `DataChangeNotification`.
|
||||
2. **C.2** `ServerUriArray` variable node under `ServerCapabilities`. Array of `String`, **includes self + peers** with deterministic ordering (self first). Updates on topology change. Compliance test asserts local-plus-peer membership.
|
||||
3. **C.3** `RedundancySupport` variable — static at startup from `RedundancyMode`. Values: `None`, `Cold`, `Warm`, `WarmActive`, `Hot`. Unsupported values (`Transparent`, `HotAndMirrored`) are rejected **pre-publish** by validator — runtime never sees them.
|
||||
4. **C.4** Client.CLI cutover test: connect to primary, read `ServiceLevel` → 255; pause primary apply → 200; unreachable peer while apply in progress → 200 (apply dominates peer-unreachable per matrix); client sees peer via `ServerUriArray`; fail primary → client reconnects to peer at 80 (isolated-backup band).
|
||||
|
||||
### Stream D — Apply-window integration (3 days)
|
||||
|
||||
1. **D.1** `sp_PublishGeneration` caller wraps the apply in `await using var lease = coordinator.BeginApplyLease(generationId, publishRequestId)`. Lease keyed to `(ConfigGenerationId, PublishRequestId)` so concurrent publishes stay isolated. Disposal decrements on every exit path.
|
||||
2. **D.2** `ApplyLeaseWatchdog` auto-closes leases older than `ApplyMaxDuration` (default 10 min) so a crashed publisher can't pin the node at mid-apply.
|
||||
3. **D.3** Pre-publish validator in `sp_PublishGeneration` rejects unsupported `RedundancyMode` values (`Transparent`, `HotAndMirrored`) with a clear error message — runtime never sees an invalid mode.
|
||||
4. **D.4** Tests: (a) mid-apply client subscribes → sees ServiceLevel drop → sees restore; (b) lease leak via `ThreadAbort` / cancellation → watchdog closes; (c) publish rejected for `Transparent` → operator-actionable error.
|
||||
|
||||
### Stream E — Admin UI + metrics (3 days)
|
||||
|
||||
1. **E.1** `RedundancyTab.razor` under `/cluster/{id}/redundancy`. Shows each node's role, current ServiceLevel (with band label per 8-state matrix), peer reachability (HTTP + UA probe separately), last apply timestamp. Role-swap button posts a draft edit on `ClusterNode.RedundancyRole`; publish applies.
|
||||
2. **E.2** OpenTelemetry meter export: `ot_opcua_service_level{cluster,node}` gauge + `ot_opcua_peer_reachable{cluster,node,peer,kind=http|ua}` + `ot_opcua_apply_in_progress{cluster,node}` + `ot_opcua_topology_valid{cluster}`. Sink via Phase 6.1 observability.
|
||||
3. **E.3** SignalR push: `FleetStatusHub` broadcasts ServiceLevel changes so the Admin UI updates within ~1 s of the coordinator observing a peer flip.
|
||||
|
||||
### Stream F — Client-interoperability matrix (3 days, new)
|
||||
|
||||
1. **F.1** Validate ServiceLevel-driven cutover against **Ignition 8.1 + 8.3**, **Kepware KEPServerEX 6.x**, **Aveva OI Gateway 2020R2 + 2023R1**. For each: configure the client with both endpoints, verify it honors `ServiceLevel` + `ServerUriArray` during primary failover.
|
||||
2. **F.2** Clients that don't honour the standards (doc field — may include Kepware and OI Gateway per Codex review) get an explicit compatibility-matrix entry: "requires manual backup-endpoint config / vendor-specific redundancy primitives". Documented in `docs/Redundancy.md`.
|
||||
3. **F.3** Galaxy MXAccess failover test — boot Galaxy.Proxy on both nodes, kill Primary, assert Galaxy consumer reconnects to Backup within `(SessionTimeout + KeepAliveInterval × 3)`. Document required session-timeout config in `docs/Redundancy.md`.
|
||||
|
||||
## Compliance Checks (run at exit gate)
|
||||
|
||||
- [ ] **OPC UA band compliance**: `0=Maintenance` reserved, `1=NoData` reserved. Operational states in 2..255 per 8-state matrix.
|
||||
- [ ] **Authoritative-Primary** ServiceLevel = 255.
|
||||
- [ ] **Isolated-Primary** (peer unreachable, self serving) = 230 — Primary retains authority.
|
||||
- [ ] **Primary-Mid-Apply** = 200.
|
||||
- [ ] **Recovering-Primary** = 180 with dwell + publish witness enforced.
|
||||
- [ ] **Authoritative-Backup** = 100.
|
||||
- [ ] **Isolated-Backup** (primary unreachable) = 80 — does NOT auto-promote.
|
||||
- [ ] **InvalidTopology** = 2 — both nodes self-demote when >1 Primary detected runtime.
|
||||
- [ ] **ServerUriArray** returns self + peer URIs, self first.
|
||||
- [ ] **UaHealthProbe authority**: integration test — peer returns HTTP 200 but OPC UA endpoint unreachable → coordinator treats peer as UA-unhealthy; peer is not a valid authority source.
|
||||
- [ ] **Apply-lease disposal**: leases close on exception, cancellation, and watchdog timeout; ServiceLevel never sticks at mid-apply band.
|
||||
- [ ] **Transparent-mode rejection**: attempting to publish `RedundancyMode=Transparent` is blocked at `sp_PublishGeneration`; runtime never sees an invalid mode.
|
||||
- [ ] **Role transition via operator publish**: FleetAdmin swaps `RedundancyRole` in a draft, publishes; both nodes re-read topology on publish confirmation + flip ServiceLevel — no restart.
|
||||
- [ ] **Client.CLI cutover**: with primary halted, Client.CLI that was connected to primary sees primary drop + reconnects to backup via `ServerUriArray`.
|
||||
- [ ] **Client interoperability matrix** (Stream F): Ignition 8.1 + 8.3 honour ServiceLevel; Kepware + Aveva OI Gateway findings documented.
|
||||
- [ ] **Galaxy MXAccess failover**: end-to-end test — primary kill → Galaxy consumer reconnects to backup within session-timeout budget.
|
||||
- [ ] No regression in existing driver test suites; no regression in `/healthz` reachability under redundancy load.
|
||||
|
||||
## Risks and Mitigations
|
||||
|
||||
| Risk | Likelihood | Impact | Mitigation |
|
||||
|------|:----------:|:------:|------------|
|
||||
| Split-brain from operator race (both nodes marked Primary) | Low | High | Coordinator rejects startup if its cluster has >1 Primary row; logs + fails fast. Document as a publish-time validation in `sp_PublishGeneration`. |
|
||||
| ServiceLevel thrashing on flaky peer | Medium | Medium | 2 s probe interval + 3-sample smoothing window; only declares a peer unreachable after 3 consecutive failed probes |
|
||||
| Client ignores ServiceLevel and stays on broken primary | Medium | Medium | Documented in `docs/Redundancy.md` — non-transparent redundancy requires client cooperation; most SCADA clients (Ignition, Kepware, Aveva OI Gateway) honor it. Unit-test the advertised values; field behavior is client-responsibility |
|
||||
| Apply-window counter leaks on exception | Low | High | `BeginApplyWindow` returns `IDisposable`; `using` syntax enforces paired decrement; unit test for exception-in-apply path |
|
||||
| `HttpClient` probe leaks sockets | Low | Medium | Single shared `HttpClient` per coordinator (not per-probe); timeouts tight to avoid keeping connections open during peer downtime |
|
||||
|
||||
## Completion Checklist
|
||||
|
||||
- [ ] Stream A: topology loader + tests
|
||||
- [ ] Stream B: peer probe + ServiceLevel calculator + 32-case matrix tests
|
||||
- [ ] Stream C: ServiceLevel / ServerUriArray / RedundancySupport node wiring + Client.CLI smoke test
|
||||
- [ ] Stream D: apply-window integration + nested-apply counter
|
||||
- [ ] Stream E: Admin `RedundancyTab` + OpenTelemetry metrics + SignalR push
|
||||
- [ ] `phase-6-3-compliance.ps1` exits 0; exit-gate doc; `docs/Redundancy.md` updated with the ServiceLevel matrix
|
||||
|
||||
## Adversarial Review — 2026-04-19 (Codex, thread `019da490-3fa0-7340-98b8-cceeca802550`)
|
||||
|
||||
1. **Crit · ACCEPT** — No publish-generation fencing enables split-publish advertising both as authoritative. **Change**: coordinator CAS on a monotonic `ConfigGenerationId`; every topology decision is generation-stamped; peers reject state propagated from a lower generation.
|
||||
2. **Crit · ACCEPT** — `>1 Primary` at startup covered but runtime containment missing when invalid topology appears later (mid-apply race). **Change**: add runtime `InvalidTopology` state — both nodes self-demote to ServiceLevel 2 (the "detected inconsistency" band, below normal operation) until convergence.
|
||||
3. **High · ACCEPT** — `0 = Faulted` collides with OPC UA Part 5 §6.3.34 semantics where 0 means **Maintenance** and 1 means NoData. **Change**: reserve **0** for operator-declared maintenance-mode only; Faulted/unreachable uses **1** (NoData); in-range degraded states occupy 2..199.
|
||||
4. **High · ACCEPT** — Matrix collapses distinct operational states onto the same value. **Change**: matrix expanded to Authoritative-Primary=255, Isolated-Primary=230 (peer unreachable — still serving), Primary-Mid-Apply=200, Recovering-Primary=180, Authoritative-Backup=100, Isolated-Backup=80 (primary unreachable — "take over if asked"), Backup-Mid-Apply=50, Recovering-Backup=30.
|
||||
5. **High · ACCEPT** — `/healthz` from 6.1 is HTTP-healthy but doesn't guarantee OPC UA data plane. **Change**: add a redundancy-specific probe `UaHealthProbe` — issues a `ReadAsync(ServiceLevel)` against the peer's OPC UA endpoint via a lightweight client session. `/healthz` remains the fast-fail; the UA probe is the authority signal.
|
||||
6. **High · ACCEPT** — `ServerUriArray` must include self + peers, not peers only. **Change**: array contains `[self.ApplicationUri, peer.ApplicationUri]` in stable deterministic ordering; compliance test asserts local-plus-peer membership.
|
||||
7. **Med · ACCEPT** — No `Faulted → Recovering → Healthy` path. **Change**: add `Recovering` state with min dwell time (60 s default) + positive publish witness (one successful Read on a reference node) before returning to Healthy. Thrash-prevention.
|
||||
8. **Med · ACCEPT** — Topology change during in-flight probe undefined. **Change**: every probe task tagged with `ConfigGenerationId` at dispatch; obsolete results discarded; in-flight probes cancelled on topology reload.
|
||||
9. **Med · ACCEPT** — Apply-window counter race on exception/cancellation/async ownership. **Change**: apply-window is a named lease keyed to `(ConfigGenerationId, PublishRequestId)` with disposal enforced via `await using`; watchdog detects leased-but-abandoned and force-closes after `ApplyMaxDuration` (default 10 min).
|
||||
10. **High · ACCEPT** — Ignition + Kepware + Aveva OI Gateway `ServiceLevel` compliance is unverified. **Change**: risk elevated to High; add Stream F (new) — build an interop matrix: validate against Ignition 8.1/8.3, Kepware KEPServerEX 6.x, Aveva OI Gateway 2020R2 + 2023R1. Document per-client cutover behaviour. Field deployments get a documented compatibility table; clients that ignore ServiceLevel documented as requiring explicit backup-endpoint config.
|
||||
11. **Med · ACCEPT** — Galaxy MXAccess re-session on Primary death not in acceptance. **Change**: Stream F adds an end-to-end failover smoke test that boots Galaxy.Proxy on both nodes, kills Primary, asserts Galaxy consumer reconnects to Backup within `(SessionTimeout + KeepAliveInterval × 3)` budget. `docs/Redundancy.md` updated with required session timeouts.
|
||||
12. **Med · ACCEPT** — Transparent-mode startup rejection is outage-prone. **Change**: `sp_PublishGeneration` validates `RedundancyMode` pre-publish — unsupported values reject the publish attempt with a clear validation error; runtime never sees an unsupported mode. Last-good config stays active.
|
||||
|
||||
134
docs/v2/implementation/phase-6-4-admin-ui-completion.md
Normal file
134
docs/v2/implementation/phase-6-4-admin-ui-completion.md
Normal file
@@ -0,0 +1,134 @@
|
||||
# Phase 6.4 — Admin UI Completion
|
||||
|
||||
> **Status**: DRAFT — Phase 1 Stream E shipped the Admin scaffold + core pages; several feature-completeness items from its completion checklist (`phase-1-configuration-and-admin-scaffold.md` §Stream E) never landed. This phase closes them.
|
||||
>
|
||||
> **Branch**: `v2/phase-6-4-admin-ui-completion`
|
||||
> **Estimated duration**: 2 weeks
|
||||
> **Predecessor**: Phase 6.3 (Redundancy runtime) — reuses the `/cluster/{id}` page layout for the new tabs
|
||||
> **Successor**: v2 release-readiness capstone (Task #121)
|
||||
|
||||
## Phase Objective
|
||||
|
||||
Close the Admin UI feature-completeness checklist that Phase 1 Stream E exit gate left open. Each item below is an existing `phase-1-configuration-and-admin-scaffold.md` completion-checklist entry that is currently unchecked.
|
||||
|
||||
Gaps to close:
|
||||
|
||||
1. **UNS Structure tab drag/move with impact preview** — decision #115 + `admin-ui.md` §"UNS". Current state: list-only render; no drag reorder; no "X lines / Y equipment impacted" preview.
|
||||
2. **Equipment CSV import + 5-identifier search** — decision #95 + #117. Current state: basic form; no CSV parser; search indexes only ZTag.
|
||||
3. **Draft-generation diff viewer** — enhance existing `DiffViewer.razor` to show generation-diff not just staged-edit diff; highlight ACL grant changes (lands after Phase 6.2).
|
||||
4. **`_base` equipment-class Identification fields exposure** — decision #138–139. Columns exist on `Equipment`; no Admin UI field group; no address-space exposure of the OPC 40010 sub-folder.
|
||||
|
||||
## Scope — What Changes
|
||||
|
||||
| Concern | Change |
|
||||
|---------|--------|
|
||||
| `Admin/Pages/UnsTab.razor` | Tree component with drag-drop using **`MudBlazor.TreeView` + `MudBlazor.DropTarget`** (existing transitive dep — no new third-party package). Native HTML5 DnD rejected because virtualization + DnD on 500+ nodes doesn't combine reliably. Each drag fires a "Compute Impact" call carrying a `DraftRevisionToken`; modal preview ("Moving Line 'Oven-2' from 'Packaging' to 'Assembly' will re-home 14 equipment + re-parent 237 tags"). **Confirm step re-checks the token** and rejects with a `409 Conflict / refresh-required` modal if the draft advanced between preview and commit. |
|
||||
| `Admin/Services/UnsImpactAnalyzer.cs` | New service. Given a move-operation (line move, area rename, line merge), computes cascade counts + `DraftRevisionToken` at preview time. Pure-function shape; testable in isolation. |
|
||||
| `Admin/Pages/EquipmentTab.razor` | Add CSV-import button → modal with file picker + dry-run preview. **Identifier search** uses the canonical decision #117 set: `ZTag / MachineCode / SAPID / EquipmentId / EquipmentUuid`. Typeahead probes each column with a ranking query (exact match score 100 → prefix 50 → opt-in LIKE 20; published > draft tie-break). Result row shows which field matched via trailing badge. |
|
||||
| `Admin/Services/EquipmentCsvImporter.cs` | New service. CSV header row must start with `# OtOpcUaCsv v1` (version marker — future shape changes bump the version). Columns: `ZTag, MachineCode, SAPID, EquipmentId, EquipmentUuid, Name, UnsAreaName, UnsLineName, Manufacturer, Model, SerialNumber, HardwareRevision, SoftwareRevision, YearOfConstruction, AssetLocation, ManufacturerUri, DeviceManualUri`. Parser rejects unknown columns + blank required fields + duplicate ZTags + missing UnsLines. |
|
||||
| **Staged-import table** `EquipmentImportBatch` | New entity `{ Id, CreatedAtUtc, CreatedBy, RowsStaged, RowsAccepted, RowsRejected, FinalisedAtUtc? }` + child `EquipmentImportRow` records. Import writes rows in chunks to the staging table (not to `Equipment`). `FinaliseImportBatch` is the atomic finalize step that applies all accepted rows to `Equipment` + `ExternalIdReservation` in one transaction — short + bounded regardless of input size. Rollback = drop the batch row; `Equipment` never partially mutates. |
|
||||
| `Admin/Pages/DraftEditor.razor` + `DiffViewer.razor` | Diff viewer refactored into a base component + section plugins: `StructuralDiffSection`, `EquipmentDiffSection`, `TagDiffSection`, `AclDiffSection` (Phase 6.2), `RedundancyDiffSection` (Phase 6.3), `IdentificationDiffSection`. Each section has a **1000-row hard cap**; over-cap renders an aggregate summary + "Load full diff" button streaming 500-row pages via SignalR. Subtree-rename diffs (decision #115 bulk restructure) surface as summary only by default. |
|
||||
| `Admin/Components/IdentificationFields.razor` | New component. Renders the OPC 40010 field set **per decision #139**: `Manufacturer, Model, SerialNumber, HardwareRevision, SoftwareRevision, YearOfConstruction, AssetLocation, ManufacturerUri, DeviceManualUri`. `ProductInstanceUri / DeviceRevision / MonthOfConstruction` dropped from this phase — they need a separate decision-log widening. |
|
||||
| `OtOpcUa.Server/OpcUa/DriverNodeManager` — Equipment folder build | When an `Equipment` row has non-null Identification fields, the server adds an `Identification` sub-folder under the Equipment node containing one variable per non-null field. **ACL binding**: the sub-folder + variables inherit the `Equipment` scope's grants from Phase 6.2's trie — no new scope level added. Documented in `acl-design.md` cross-reference update. |
|
||||
|
||||
## Scope — What Does NOT Change
|
||||
|
||||
| Item | Reason |
|
||||
|------|--------|
|
||||
| Admin UI visual language | Bootstrap 5 / cookie auth / sidebar layout unchanged — consistency with ScadaLink design reference. |
|
||||
| LDAP auth flow | Already shipped in Phase 1. Phase 6.4 is additive UI only. |
|
||||
| Core abstractions / driver layer | Admin UI changes don't touch drivers. |
|
||||
| Equipment-class *template schema validation* | Still deferred (decision #112 — schemas repo not landed). We expose the Identification fields but don't validate against a template hierarchy. |
|
||||
| Drag/move to *other clusters* | Out of scope — equipment is cluster-scoped per decision #82. Cross-cluster migration is a different workflow. |
|
||||
|
||||
## Entry Gate Checklist
|
||||
|
||||
- [ ] Phase 6.2 merged (ACL grants are part of the new diff viewer sections)
|
||||
- [ ] Phase 6.3 merged (redundancy-role changes are part of the diff viewer)
|
||||
- [ ] `phase-1-configuration-and-admin-scaffold.md` §Stream E completion checklist re-read — confirm these are the remaining items
|
||||
- [ ] `admin-ui.md` re-skimmed for screen layouts
|
||||
- [ ] Existing `EquipmentTab.razor` / `UnsTab.razor` / `DraftEditor.razor` diff'd against what ships today so the edits are additive not destructive
|
||||
- [ ] Dev Galaxy available for OPC 40010 exposure smoke testing
|
||||
|
||||
## Task Breakdown
|
||||
|
||||
### Stream A — UNS drag/reorder + impact preview (5 days)
|
||||
|
||||
1. **A.1** 1000-node synthetic seed fixture. Drag-latency bench against `MudBlazor.TreeView` + `MudBlazor.DropTarget` — commit to the component if latency budget (100 ms drag-enter feedback) holds; fall back to flat-list reorder UI (Area/Line dropdowns) with loss of visual drag affordance otherwise.
|
||||
2. **A.2** `UnsImpactAnalyzer` service. Inputs: `(DraftGenerationId, MoveOperation, DraftRevisionToken)`. Outputs: `ImpactPreview { AffectedEquipmentCount, AffectedTagCount, CascadeWarnings[], DraftRevisionToken }`. Pure-function shape; testable in isolation.
|
||||
3. **A.3** Modal preview wired to `UnsImpactAnalyzer`. **Confirm** re-reads the current draft revision + compares against the preview's token; if the draft advanced (another operator saved a different edit), show a `409 Conflict / refresh-required` modal rather than silently overwriting.
|
||||
4. **A.4** Cross-cluster drop attempts: target disabled + toast "Equipment is cluster-scoped (decision #82). To move across clusters, use Export → Import on the Cluster detail page." Plus help link.
|
||||
5. **A.5** Playwright (or equivalent) smoke test: drag a line across areas, assert modal shows right counts, assert draft row reflects the move; concurrent-edit test runs two sessions + asserts the later Confirm hits the 409.
|
||||
|
||||
### Stream B — Equipment CSV import + 5-identifier search (5 days)
|
||||
|
||||
1. **B.1** `EquipmentCsvImporter`. Strict RFC 4180 parser (per decision #95). Header row validation: first line must match `# OtOpcUaCsv v1` — future versions fork parser versions. Required columns: `ZTag, MachineCode, SAPID, EquipmentId, EquipmentUuid, Name, UnsAreaName, UnsLineName`. Optional: `Manufacturer, Model, SerialNumber, HardwareRevision, SoftwareRevision, YearOfConstruction, AssetLocation, ManufacturerUri, DeviceManualUri`. Parser rejects unknown columns + blank required fields + duplicate ZTags.
|
||||
2. **B.2** `EquipmentImportBatch` + `EquipmentImportRow` staging tables (migration). Import writes preview rows to staging via chunked inserts; staging never blocks `Equipment` or `ExternalIdReservation`. Preview query reads staging + validates each row against the current `Equipment` state + `ExternalIdReservation` freshness.
|
||||
3. **B.3** `ImportPreview` UI — per-row accept/reject table. Reject reasons: "ZTag already exists in draft", "ExternalIdReservation conflict with Cluster X", "UnsLineName not found in draft UNS tree", etc. Operator reviews + clicks "Commit".
|
||||
4. **B.4** `FinaliseImportBatch` — atomic finalize. One EF transaction applies accepted rows to `Equipment` + `ExternalIdReservation`; duration bounded regardless of input size (the atomic step is a bulk-insert, not per-row row-by-row). Rollback = drop batch row via `DropImportBatch`; `Equipment` never partially mutates.
|
||||
5. **B.5** Five-identifier search. Rank SQL: exact match any identifier = score 100, prefix match = 50, LIKE-fuzzy (opt-in via `?fuzzy=true`) = 20; tie-break `published > draft` then `RowVersion DESC`. Typeahead shows which field matched via trailing badge.
|
||||
6. **B.6** Smoke tests: 100-row CSV with 10 conflicts (5 ZTag dupes, 3 reservation clashes, 2 missing UnsLines); 10k-row perf test asserting finalize txn < 30 s; concurrent import + external `ExternalIdReservation` insert test asserts retryable-conflict handling.
|
||||
|
||||
### Stream C — Diff viewer enhancements (4 days)
|
||||
|
||||
1. **C.1** Refactor `DiffViewer.razor` into a base component + section plugins. Plugins: `StructuralDiffSection` (UNS tree), `EquipmentDiffSection`, `TagDiffSection`, `AclDiffSection` (Phase 6.2), `RedundancyDiffSection` (Phase 6.3), `IdentificationDiffSection`.
|
||||
2. **C.2** Each section renders collapsed by default; counts + top-line summary always visible. **1000-row hard cap** per section — over-cap sections render aggregate summary (e.g. "237 equipment re-parented from Packaging to Assembly") with a "Load full diff" button that streams 500-row pages via SignalR.
|
||||
3. **C.3** Subtree-rename diffs (decision #115 bulk restructure) surface as summary only by default regardless of row count.
|
||||
4. **C.4** Tests: seed two generations with deliberate diffs; assert every section reports the right counts + top-line summary + hard-cap behavior.
|
||||
|
||||
### Stream D — OPC 40010 Identification exposure (3 days)
|
||||
|
||||
1. **D.1** `IdentificationFields.razor` component. Renders the **9 decision #139 fields**: `Manufacturer, Model, SerialNumber, HardwareRevision, SoftwareRevision, YearOfConstruction, AssetLocation, ManufacturerUri, DeviceManualUri`. Labelled inputs; nullable columns show empty input; required-field validation on commit only.
|
||||
2. **D.2** `DriverNodeManager` equipment-folder builder — after building the equipment node, inspect the 9 Identification columns; if any non-null, add an `Identification` sub-folder with variable-per-non-null-field. ACL binding: sub-folder + variables inherit the **same `ScopeId` as the Equipment node** (Phase 6.2's trie treats them as part of the Equipment scope — no new scope level).
|
||||
3. **D.3** Address-space smoke test via Client.CLI: browse an equipment node, assert `Identification` sub-folder present when columns are set, absent when all null, variables match the field values.
|
||||
4. **D.4** ACL integration test: a user with Equipment-level grant reads the `Identification` variables without needing a separate grant; a user without the Equipment grant gets `BadUserAccessDenied` on both the Equipment node + its Identification variables.
|
||||
|
||||
## Compliance Checks (run at exit gate)
|
||||
|
||||
- [ ] **UNS drag/move**: drag a line across areas; modal preview shows correct impacted-equipment + impacted-tag counts.
|
||||
- [ ] **Concurrent-edit safety**: two-session test — session B saves a draft edit after session A opened the preview; session A's Confirm returns `409 Conflict / refresh-required` instead of overwriting.
|
||||
- [ ] **Cross-cluster drop**: dropping equipment across cluster boundaries is disabled + shows actionable toast pointing to Export/Import workflow.
|
||||
- [ ] **1000-node tree**: drag operations on a 1000-node seed maintain < 100 ms drag-enter feedback.
|
||||
- [ ] **CSV header version**: file missing `# OtOpcUaCsv v1` first line is rejected pre-parse.
|
||||
- [ ] **CSV canonical identifier set**: columns match decision #117 (ZTag / MachineCode / SAPID / EquipmentId / EquipmentUuid); drift from the earlier draft surfaces as a test failure.
|
||||
- [ ] **Staged-import atomicity**: `FinaliseImportBatch` transaction bounded < 30 s for a 10k-row import; pre-finalize stagings visible only to the importing user; rollback via `DropImportBatch`.
|
||||
- [ ] **Concurrent import + external reservation**: concurrent test — third party inserts to `ExternalIdReservation` mid-finalize; finalize retries with conflict handling; no corruption.
|
||||
- [ ] **5-identifier search ranking**: exact matches outrank prefix matches; published outranks draft for equal scores.
|
||||
- [ ] **Diff viewer section caps**: 2000-row subtree-rename diff renders as summary only; "Load full diff" streams in pages.
|
||||
- [ ] **OPC 40010 field list match**: rendered field group matches decision #139 exactly; no extra fields.
|
||||
- [ ] **OPC 40010 exposure**: Client.CLI browse shows `Identification` sub-folder when equipment has non-null columns; absent when all null.
|
||||
- [ ] **ACL inheritance for Identification**: integration test — Equipment-grant user reads Identification; no-grant user gets `BadUserAccessDenied` on both.
|
||||
- [ ] **Visual parity reviewer**: named role (`FleetAdmin` user, not the implementation lead) compares side-by-side against `admin-ui.md` §Visual-Design reference panels; signoff artefact is a checked-in screenshot set under `docs/v2/visual-compliance/phase-6-4/`.
|
||||
|
||||
## Risks and Mitigations
|
||||
|
||||
| Risk | Likelihood | Impact | Mitigation |
|
||||
|------|:----------:|:------:|------------|
|
||||
| UNS drag-drop janky on large trees (>500 nodes) | Medium | Medium | Virtualize the tree component; default-collapse nested areas; test with a synthetic 1000-equipment seed |
|
||||
| CSV import performance on 10k-row imports | Medium | Medium | Stream-parse rather than load-into-memory; preview renders in batches of 100; commit is chunked-EF-insert with progress bar |
|
||||
| Diff viewer becomes unwieldy with many sections | Low | Medium | Each section collapsed by default; top-line summary row always shown; Phase 6.4 caps at 6 sections |
|
||||
| OPC 40010 sub-folder accidentally exposes NULL/empty identification columns as empty-string variables | Low | Low | Column null-check in the builder; drop variables whose DB value is null |
|
||||
| 5-identifier search pulls full table | Medium | Medium | Indexes on each of ZTag/SAPID/UniqueId/Alias1/Alias2; search query uses a UNION of 5 indexed lookups; falls back to LIKE only on explicit operator opt-in |
|
||||
|
||||
## Completion Checklist
|
||||
|
||||
- [ ] Stream A: `UnsImpactAnalyzer` + drag-drop tree + modal preview + Playwright smoke
|
||||
- [ ] Stream B: `EquipmentCsvImporter` + preview modal + 5-identifier search + conflict-rollback test
|
||||
- [ ] Stream C: `DiffViewer` refactor + 6 section plugins + 2-generation diff test
|
||||
- [ ] Stream D: `IdentificationFields.razor` + address-space builder change + Client.CLI browse test
|
||||
- [ ] Visual-compliance reviewer signoff
|
||||
- [ ] Full solution `dotnet test` passes; `phase-6-4-compliance.ps1` exits 0; exit-gate doc
|
||||
|
||||
## Adversarial Review — 2026-04-19 (Codex, via `codex-rescue` subagent)
|
||||
|
||||
1. **Crit · ACCEPT** — Stale UNS impact preview can overwrite concurrent draft edits. **Change**: each preview carries a `DraftRevisionToken`; `Confirm` compares against the current draft + rejects with a `409 Conflict / refresh-required` modal if any draft edit landed since the preview was generated. Stream A.3 updated.
|
||||
2. **High · ACCEPT** — CSV import atomicity is internally contradictory (single EF transaction vs. chunked inserts). **Change**: one explicit model — staged-import table (`EquipmentImportBatch { Id, CreatedAtUtc, RowsStaged, RowsAccepted, RowsRejected }`) receives rows in chunks; final `FinaliseImportBatch` is atomic over `Equipment` + `ExternalIdReservation`. Rollback is "drop the batch row" — the real Equipment table is never partially mutated.
|
||||
3. **Crit · ACCEPT** — Identifier contract rewrite mis-cites decisions. **Change**: revert to the `admin-ui.md` + decision #117 canonical set — `ZTag / MachineCode / SAPID / EquipmentId / EquipmentUuid`. CSV header follows that set verbatim. Introduce a separate decision entry for versioned CSV header shape before adding any new column; CSV header row must start with `# OtOpcUaCsv v1` so future shape changes are unambiguous.
|
||||
4. **Med · ACCEPT** — Search ordering undefined. **Change**: rank SQL — exact match on any identifier scores 100; prefix match 50; LIKE-fuzzy 20; published > draft tie-breaker; `ORDER BY score DESC, RowVersion DESC`. Typeahead shows which field matched via trailing badge.
|
||||
5. **High · ACCEPT** — HTML5 DnD on virtualized tree is aspirational. **Change**: Stream A.2 rewritten — commits to **`MudBlazor.TreeView` + `MudBlazor.DropTarget`** (already a transitive dep via the existing Admin UI). Build a 1000-node synthetic seed in A.1 + validate drag-latency budget before implementing impact preview. If MudBlazor can't hit the budget, fall back to a flat-list reorder UI with Area/Line dropdowns (loss of visual drag affordance but unblocks the feature).
|
||||
6. **Med · ACCEPT** — Collapsed-by-default doesn't handle generation-sized diffs. **Change**: each diff section has a hard row cap (1000 by default). Over-cap sections render an aggregate summary + "Load full diff" button that streams via SignalR in 500-row pages. Decision #115 subtree renames surface as a "N equipment re-parented under X → Y" summary instead of row-by-row.
|
||||
7. **High · ACCEPT** — OPC 40010 field list doesn't match decision #139. **Change**: field group realigned to `Manufacturer, Model, SerialNumber, HardwareRevision, SoftwareRevision, YearOfConstruction, AssetLocation, ManufacturerUri, DeviceManualUri`. `ProductInstanceUri / DeviceRevision / MonthOfConstruction` dropped from Phase 6.4 — they belong to a future OPC 40010 widening decision.
|
||||
8. **High · ACCEPT** — `Identification` subtree unreconciled with ACL hierarchy (Phase 6.2 6-level scope). **Change**: address-space builder creates the Identification sub-folder under the Equipment node **with the same ScopeId as Equipment** — no new scope level. ACL evaluator treats `…/Equipment/Identification/X` as inheriting the `Equipment` scope's grants. Documented in Phase 6.2's `acl-design.md` cross-reference update.
|
||||
9. **Low · ACCEPT** — Visual-review gate names nonexistent reviewer role. **Change**: rubric defined — a named "Admin UX reviewer" (role `FleetAdmin` user, not the implementation lead) compares side-by-side screenshots against the `admin-ui.md` §Visual-Design reference panels; signoff artefact is a checked-in screenshot set under `docs/v2/visual-compliance/phase-6-4/`.
|
||||
10. **Med · ACCEPT** — Cross-cluster drag/drop lacks loud failure path. **Change**: on drop across cluster boundary, disable the drop target + show a toast "Equipment is cluster-scoped (decision #82). To move across clusters, use the Export → Import workflow on the Cluster detail page." Plus a help link. Tested in Stream A.4.
|
||||
|
||||
@@ -7,100 +7,189 @@ Basic256Sha256 endpoints and alarms are observable through
|
||||
specific before the stack can fully replace the v1 deployment, in
|
||||
rough priority order.
|
||||
|
||||
## 1. Proxy-side `IHistoryProvider` for `ReadAtTime` / `ReadEvents`
|
||||
## 1. Proxy-side `IHistoryProvider` for `ReadAtTime` / `ReadEvents` — **DONE (PRs 35 + 38)**
|
||||
|
||||
**Status**: Host-side IPC shipped (PR 10 + PR 11). Proxy consumer not written.
|
||||
PR 35 extended `IHistoryProvider` with `ReadAtTimeAsync` + `ReadEventsAsync`
|
||||
(default throwing implementations so existing impls keep compiling), added the
|
||||
`HistoricalEvent` + `HistoricalEventsResult` records to `Core.Abstractions`,
|
||||
and implemented both methods in `GalaxyProxyDriver` on top of the PR 10 / PR 11
|
||||
IPC messages.
|
||||
|
||||
PR 10 added `HistoryReadAtTimeRequest/Response` on the IPC wire and
|
||||
`MxAccessGalaxyBackend.HistoryReadAtTimeAsync` delegates to
|
||||
`HistorianDataSource.ReadAtTimeAsync`. PR 11 did the same for events
|
||||
(`HistoryReadEventsRequest/Response` + `GalaxyHistoricalEvent`). The Proxy
|
||||
side (`GalaxyProxyDriver`) doesn't call those yet — `Core.Abstractions.IHistoryProvider`
|
||||
only exposes `ReadRawAsync` + `ReadProcessedAsync`.
|
||||
PR 38 wired the OPC UA HistoryRead service-handler through
|
||||
`DriverNodeManager` by overriding `CustomNodeManager2`'s four per-kind hooks —
|
||||
`HistoryReadRawModified` / `HistoryReadProcessed` / `HistoryReadAtTime` /
|
||||
`HistoryReadEvents`. Each walks `nodesToProcess`, resolves the driver-side
|
||||
full reference from `NodeId.Identifier`, dispatches to the right
|
||||
`IHistoryProvider` method, and populates the paired results + errors lists
|
||||
(both must be set — the MasterNodeManager merges them and a Good result with
|
||||
an unset error slot serializes as `BadHistoryOperationUnsupported` on the
|
||||
wire). Historized variables gain `AccessLevels.HistoryRead` so the stack
|
||||
dispatches; the driver root folder gains `EventNotifiers.HistoryRead` so
|
||||
`HistoryReadEvents` can target it.
|
||||
|
||||
**To do**:
|
||||
- Extend `IHistoryProvider` with `ReadAtTimeAsync(string, DateTime[], …)` and
|
||||
`ReadEventsAsync(string?, DateTime, DateTime, int, …)`.
|
||||
- `GalaxyProxyDriver` calls the new IPC message kinds.
|
||||
- `DriverNodeManager` wires the new capability methods onto `HistoryRead`
|
||||
`AtTime` + `Events` service handlers.
|
||||
- Integration test: OPC UA client calls `HistoryReadAtTime` / `HistoryReadEvents`,
|
||||
value flows through IPC to the Host's `HistorianDataSource`, back to the client.
|
||||
Aggregate translation uses a small `MapAggregate` helper that handles
|
||||
`Average` / `Minimum` / `Maximum` / `Total` / `Count` (the enum surface the
|
||||
driver exposes) and returns null for unsupported aggregates so the handler
|
||||
can surface `BadAggregateNotSupported`. Raw+Processed+AtTime wrap driver
|
||||
samples as `HistoryData` in an `ExtensionObject`; Events emits a
|
||||
`HistoryEvent` with the standard BaseEventType field list (EventId /
|
||||
SourceName / Message / Severity / Time / ReceiveTime) — custom
|
||||
`SelectClause` evaluation is an explicit follow-up.
|
||||
|
||||
## 2. Write-gating by role
|
||||
**Tests**:
|
||||
|
||||
**Status**: `RoleBasedIdentity.Roles` populated on the session (PR 19) but
|
||||
`DriverNodeManager.OnWriteValue` doesn't consult it.
|
||||
- `DriverNodeManagerHistoryMappingTests` — 12 unit cases pinning
|
||||
`MapAggregate`, `BuildHistoryData`, `BuildHistoryEvent`, `ToDataValue`.
|
||||
- `HistoryReadIntegrationTests` — 5 end-to-end cases drive a real OPC UA
|
||||
client (`Session.HistoryRead`) against a fake `IHistoryProvider` driver
|
||||
through the running stack. Covers raw round-trip, processed with Average
|
||||
aggregate, unsupported aggregate → `BadAggregateNotSupported`, at-time
|
||||
timestamp forwarding, and events field-list shape.
|
||||
|
||||
CLAUDE.md defines the role set: `ReadOnly` / `WriteOperate` / `WriteTune` /
|
||||
`WriteConfigure` / `AlarmAck`. Each `DriverAttributeInfo.SecurityClassification`
|
||||
maps to a required role for writes.
|
||||
**Deferred**:
|
||||
- Continuation-point plumbing via `Session.Save/RestoreHistoryContinuationPoint`.
|
||||
Driver returns null continuations today so the pass-through is fine.
|
||||
- Per-`SelectClause` evaluation in HistoryReadEvents — clients that send a
|
||||
custom field selection currently get the standard BaseEventType layout.
|
||||
|
||||
**To do**:
|
||||
- Add a `RoleRequirements` table: `SecurityClassification` → required role.
|
||||
- `OnWriteValue` reads `context.UserIdentity` → cast to `RoleBasedIdentity`
|
||||
→ check role membership before calling `IWritable.WriteAsync`. Return
|
||||
`BadUserAccessDenied` on miss.
|
||||
- Unit test against a fake `ISystemContext` with varying role sets.
|
||||
## 2. Write-gating by role — **DONE (PR 26)**
|
||||
|
||||
## 3. Admin UI client-cert trust management
|
||||
Landed in PR 26. `WriteAuthzPolicy` in `Server/Security/` maps
|
||||
`SecurityClassification` → required role (`FreeAccess` → no role required,
|
||||
`Operate`/`SecuredWrite` → `WriteOperate`, `Tune` → `WriteTune`,
|
||||
`Configure`/`VerifiedWrite` → `WriteConfigure`, `ViewOnly` → deny regardless).
|
||||
`DriverNodeManager` caches the classification per variable during discovery and
|
||||
checks the session's roles (via `IRoleBearer`) in `OnWriteValue` before calling
|
||||
`IWritable.WriteAsync`. Roles do not cascade — a session with `WriteOperate`
|
||||
can't write a `Tune` attribute unless it also carries `WriteTune`.
|
||||
|
||||
**Status**: Server side auto-accepts untrusted client certs when the
|
||||
`AutoAcceptUntrustedClientCertificates` option is true (dev default).
|
||||
Production deployments want operator-controlled trust via the Admin UI.
|
||||
See `feedback_acl_at_server_layer.md` in memory for the architectural directive
|
||||
that authz stays at the server layer and never delegates to driver-specific auth.
|
||||
|
||||
**To do**:
|
||||
- Surface the server's rejected-certificate store in the Admin UI.
|
||||
- Page to move certs between `rejected` / `trusted`.
|
||||
- Flip `AutoAcceptUntrustedClientCertificates` to false once Admin UI is the
|
||||
trust gate.
|
||||
## 3. Admin UI client-cert trust management — **DONE (PR 28)**
|
||||
|
||||
## 4. Live-LDAP integration test
|
||||
PR 28 shipped `/certificates` in the Admin UI. `CertTrustService` reads the OPC
|
||||
UA server's PKI store root (`OpcUaServerOptions.PkiStoreRoot` — default
|
||||
`%ProgramData%\OtOpcUa\pki`) and lists rejected + trusted certs by parsing the
|
||||
`.der` files directly, so it has no `Opc.Ua` dependency and runs on any
|
||||
Admin host that can reach the shared PKI directory.
|
||||
|
||||
**Status**: PR 19 unit-tested the auth-flow shape; the live bind path is
|
||||
exercised only by the pre-existing `Admin.Tests/LdapLiveBindTests.cs` which
|
||||
uses the same Novell library against a running GLAuth at `localhost:3893`.
|
||||
Operator actions: Trust (moves `rejected/certs/*.der` → `trusted/certs/*.der`),
|
||||
Delete rejected, Revoke trust. The OPC UA stack re-reads the trusted store on
|
||||
each new client handshake, so no explicit reload signal is needed —
|
||||
operators retry the rejected client's connection after trusting.
|
||||
|
||||
**To do**:
|
||||
- Add `OpcUaServerIntegrationTests.Valid_username_authenticates_against_live_ldap`
|
||||
with the same skip-when-unreachable guard.
|
||||
- Assert `session.Identity` on the server side carries the expected role
|
||||
after bind — requires exposing a test hook or reading identity from a
|
||||
new `IHostConnectivityProbe`-style "whoami" variable in the address space.
|
||||
Deferred: flipping `AutoAcceptUntrustedClientCertificates` to `false` as the
|
||||
deployment default. That's a production-hardening config change, not a code
|
||||
gap — the Admin UI is now ready to be the trust gate.
|
||||
|
||||
## 5. Full Galaxy live-service smoke test against the merged v2 stack
|
||||
## 4. Live-LDAP integration test — **DONE (PR 31)**
|
||||
|
||||
**Status**: Individual pieces have live smoke tests (PR 5 MXAccess, PR 13
|
||||
probe manager, PR 14 alarm tracker), but the full loop — OPC UA client →
|
||||
`OtOpcUaServer` → `GalaxyProxyDriver` (in-process) → named-pipe to
|
||||
Galaxy.Host subprocess → live MXAccess runtime → real Galaxy objects — has
|
||||
no single end-to-end smoke test.
|
||||
PR 31 shipped `Server.Tests/LdapUserAuthenticatorLiveTests.cs` — 6 live-bind
|
||||
tests against the dev GLAuth instance at `localhost:3893`, skipped cleanly
|
||||
when the port is unreachable. Covers: valid bind, wrong password, unknown
|
||||
user, empty credentials, single-group → WriteOperate mapping, multi-group
|
||||
admin user surfacing all mapped roles.
|
||||
|
||||
**To do**:
|
||||
- Test that spawns the full topology, discovers a deployed Galaxy object,
|
||||
subscribes to one of its attributes, writes a value back, and asserts the
|
||||
write round-tripped through MXAccess. Skip when ArchestrA isn't running.
|
||||
Also added `UserNameAttribute` to `LdapOptions` (default `uid` for RFC 2307
|
||||
compat) so Active Directory deployments can configure `sAMAccountName` /
|
||||
`userPrincipalName` without code changes. `LdapUserAuthenticatorAdCompatTests`
|
||||
(5 unit guards) pins the AD-shape DN parsing + filter escape behaviors. See
|
||||
`docs/security.md` §"Active Directory configuration" for the AD appsettings
|
||||
snippet.
|
||||
|
||||
## 6. Second driver instance on the same server
|
||||
Deferred: asserting `session.Identity` end-to-end on the server side (i.e.
|
||||
drive a full OPC UA session with username/password, then read an
|
||||
`IHostConnectivityProbe`-style "whoami" node to verify the role surfaced).
|
||||
That needs a test-only address-space node and is a separate PR.
|
||||
|
||||
**Status**: `DriverHost.RegisterAsync` supports multiple drivers; the OPC UA
|
||||
server creates one `DriverNodeManager` per driver and isolates their
|
||||
subtrees under distinct namespace URIs. Not proven with two active
|
||||
`GalaxyProxyDriver` instances pointing at different Galaxies.
|
||||
## 5. Full Galaxy live-service smoke test against the merged v2 stack — **IN PROGRESS (PRs 36 + 37)**
|
||||
|
||||
**To do**:
|
||||
- Integration test that registers two driver instances, each with a distinct
|
||||
`DriverInstanceId` + endpoint in its own session, asserts nodes from both
|
||||
appear under the correct subtrees, alarm events land on the correct
|
||||
instance's condition nodes.
|
||||
PR 36 shipped the prerequisites helper (`AvevaPrerequisites`) that probes
|
||||
every dependency a live smoke test needs and produces actionable skip
|
||||
messages.
|
||||
|
||||
## 7. Host-status per-AppEngine granularity → Admin UI dashboard
|
||||
PR 37 shipped the live-stack smoke test project structure:
|
||||
`tests/Driver.Galaxy.Proxy.Tests/LiveStack/` with `LiveStackFixture` (connects
|
||||
to the *already-running* `OtOpcUaGalaxyHost` Windows service via named pipe;
|
||||
never spawns the Host process) and `LiveStackSmokeTests` covering:
|
||||
|
||||
**Status**: PR 13 ships per-platform/per-AppEngine `ScanState` probing; PR 17
|
||||
surfaces the resulting `OnHostStatusChanged` events through OPC UA. Admin
|
||||
UI doesn't render a per-host dashboard yet.
|
||||
- Fixture initializes successfully (IPC handshake succeeds end-to-end).
|
||||
- Driver reports `DriverState.Healthy` post-handshake.
|
||||
- `DiscoverAsync` returns at least one variable from the live Galaxy.
|
||||
- `GetHostStatuses` reports at least one Platform/AppEngine host.
|
||||
- `ReadAsync` on a discovered variable round-trips through
|
||||
Proxy → Host pipe → MXAccess → back without a BadInternalError.
|
||||
|
||||
**To do**:
|
||||
- SignalR hub push of `HostStatusChangedEventArgs` to the Admin UI.
|
||||
- Dashboard page showing each tracked host, current state, last transition
|
||||
time, failure count.
|
||||
Shared secret + pipe name resolve from `OTOPCUA_GALAXY_SECRET` /
|
||||
`OTOPCUA_GALAXY_PIPE` env vars, falling back to reading the service's
|
||||
registry-stored Environment values (requires elevated test host).
|
||||
|
||||
**PR 40** added the write + subscribe facts targeting
|
||||
`DelmiaReceiver_001.TestAttribute` (the writable Boolean UDA the dev Galaxy
|
||||
ships under TestMachine_001) — write-then-read with a 5s scan-window poll +
|
||||
restore-on-finally, and subscribe-then-write asserting both an initial-value
|
||||
OnDataChange and a post-write OnDataChange. PR 39 added the elevated-shell
|
||||
short-circuit so a developer running from an admin window gets an actionable
|
||||
skip instead of `UnauthorizedAccessException`.
|
||||
|
||||
**Run the live tests** (from a NORMAL non-admin PowerShell):
|
||||
|
||||
```powershell
|
||||
$env:OTOPCUA_GALAXY_SECRET = Get-Content C:\Users\dohertj2\Desktop\lmxopcua\.local\galaxy-host-secret.txt
|
||||
cd C:\Users\dohertj2\Desktop\lmxopcua
|
||||
dotnet test tests\ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Proxy.Tests --filter "FullyQualifiedName~LiveStackSmokeTests"
|
||||
```
|
||||
|
||||
Expected: 7/7 pass against the running `OtOpcUaGalaxyHost` service.
|
||||
|
||||
**Remaining for #5 in production-grade form**:
|
||||
- Confirm the suite passes from a non-elevated shell (operator action).
|
||||
- Add similar facts for an alarm-source attribute once `TestMachine_001` (or
|
||||
a sibling) carries a deployed alarm condition — the current dev Galaxy's
|
||||
TestAttribute isn't alarm-flagged.
|
||||
|
||||
## 6. Second driver instance on the same server — **DONE (PR 32)**
|
||||
|
||||
`Server.Tests/MultipleDriverInstancesIntegrationTests.cs` registers two
|
||||
drivers with distinct `DriverInstanceId`s on one `DriverHost`, spins up the
|
||||
full OPC UA server, and asserts three behaviors: (1) each driver's namespace
|
||||
URI (`urn:OtOpcUa:{id}`) resolves to a distinct index in the client's
|
||||
NamespaceUris, (2) browsing one subtree returns that driver's folder and
|
||||
does NOT leak the other driver's folder, (3) reads route to the correct
|
||||
driver — the alpha instance returns 42 while beta returns 99, so a misroute
|
||||
would surface at the assertion layer.
|
||||
|
||||
Deferred: the alarm-event multi-driver parity case (two drivers each raising
|
||||
a `GalaxyAlarmEvent`, assert each condition lands on its owning instance's
|
||||
condition node). Alarm tracking already has its own integration test
|
||||
(`AlarmSubscription*`); the multi-driver alarm case would need a stub
|
||||
`IAlarmSource` that's worth its own focused PR.
|
||||
|
||||
## 7. Host-status per-AppEngine granularity → Admin UI dashboard — **DONE (PRs 33 + 34)**
|
||||
|
||||
**PR 33** landed the data layer: `DriverHostStatus` entity + migration with
|
||||
composite key `(NodeId, DriverInstanceId, HostName)` and two query-supporting
|
||||
indexes (per-cluster drill-down on `NodeId`, stale-row detection on
|
||||
`LastSeenUtc`).
|
||||
|
||||
**PR 34** wired the publisher + consumer. `HostStatusPublisher` is a
|
||||
`BackgroundService` in the Server process that walks every registered
|
||||
`IHostConnectivityProbe`-capable driver every 10s, calls
|
||||
`GetHostStatuses()`, and upserts rows (`LastSeenUtc` advances each tick;
|
||||
`State` + `StateChangedUtc` update on transitions). Admin UI `/hosts` page
|
||||
groups by cluster, shows four summary cards (Hosts / Running / Stale /
|
||||
Faulted), and flags rows whose `LastSeenUtc` is older than 30s as Stale so
|
||||
operators see crashed Servers without waiting for a state change.
|
||||
|
||||
Deferred as follow-ups:
|
||||
|
||||
- Event-driven push (subscribe to `OnHostStatusChanged` per driver for
|
||||
sub-heartbeat latency). Adds DriverHost lifecycle-event plumbing;
|
||||
10s polling is fine for operator-scale use.
|
||||
- Failure-count column — needs the publisher to track a transition history
|
||||
per host, not just current-state.
|
||||
- SignalR fan-out to the Admin page (currently the page polls the DB, not
|
||||
a hub). The DB-polled version is fine at current cadence but a hub push
|
||||
would eliminate the 10s race where a new row sits in the DB before the
|
||||
Admin page notices.
|
||||
|
||||
451
docs/v2/mitsubishi.md
Normal file
451
docs/v2/mitsubishi.md
Normal file
@@ -0,0 +1,451 @@
|
||||
# Mitsubishi Electric MELSEC — Modbus TCP quirks
|
||||
|
||||
Mitsubishi's MELSEC family speaks Modbus TCP through a patchwork of add-on modules
|
||||
and built-in Ethernet ports, not a single unified stack. The module names are
|
||||
confusingly similar (`QJ71MB91` is *serial* RTU, `QJ71MT91` is the TCP/IP module
|
||||
[9]; `LJ71MT91` is the L-series equivalent; `RJ71EN71` is the iQ-R Ethernet module
|
||||
with a MODBUS/TCP *slave* mode bolted on [8]; `FX3U-ENET`, `FX3U-ENET-P502`,
|
||||
`FX3U-ENET-ADP`, `FX3GE` built-in, and `FX5U` built-in are all different code
|
||||
paths) — and every one of the categories below has at least one trap a textbook
|
||||
Modbus client gets wrong: hex-numbered X/Y devices colliding with decimal Modbus
|
||||
addresses, a user-defined "device assignment" parameter block that means *no two
|
||||
sites are identical*, CDAB-vs-ABCD word order driven by how the ladder built the
|
||||
32-bit value, sub-spec FC16 caps on the older QJ71MT91, and an FX3U port-502
|
||||
licensing split that makes `FX3U-ENET` and `FX3U-ENET-P502` different SKUs.
|
||||
This document catalogues each quirk, cites primary sources, and names the
|
||||
ModbusPal integration test we'd write for it (convention from
|
||||
`docs/v2/modbus-test-plan.md`: `Mitsubishi_<model>_<behavior>`).
|
||||
|
||||
## Models and server/client capability
|
||||
|
||||
| Model | Family | Modbus TCP server | Modbus TCP client | Source |
|
||||
|------------------------|----------|-------------------|-------------------|--------|
|
||||
| `QJ71MT91` | MELSEC-Q | Yes (slave) | Yes (master) | [9] |
|
||||
| `QJ71MB91` | MELSEC-Q | **Serial only** — RS-232/422/485 RTU, *not TCP* | — | [1][3] |
|
||||
| `LJ71MT91` | MELSEC-L | Yes (slave) | Yes (master) | [10] |
|
||||
| `RJ71EN71` / `RnENCPU` | MELSEC iQ-R | Yes (slave) | Yes (master) | [8] |
|
||||
| `RJ71C24` / `RJ71C24-R2` | MELSEC iQ-R | RTU (serial) | RTU (serial) | [13] |
|
||||
| iQ-R built-in Ethernet | CPU | Yes (slave) | Yes (master) | [7] |
|
||||
| iQ-F `FX5U` built-in Ethernet | CPU | Yes, firmware ≥ 1.060 [11] | Yes | [7][11][12] |
|
||||
| `FX3U-ENET` | FX3U bolt-on | Yes (slave), but **not on port 502** [5] | Yes | [4][5] |
|
||||
| `FX3U-ENET-P502` | FX3U bolt-on | Yes (slave), port 502 enabled | Yes | [5] |
|
||||
| `FX3U-ENET-ADP` | FX3U adapter | **No MODBUS** [5] | No MODBUS | [5] |
|
||||
| `FX3GE` built-in | FX3GE CPU | No MODBUS (needs ENET module) [6] | No | [6] |
|
||||
| `FX3G` + `FX3U-ENET` | FX3G | Yes via ENET module | Yes | [6] |
|
||||
|
||||
- A common integration mistake is to buy `FX3U-ENET-ADP` expecting MODBUS —
|
||||
that adapter speaks only MC protocol / SLMP. Our driver should surface a clear
|
||||
capability error, not "connection refused", when the operator's device tag
|
||||
says `FX3U-ENET-ADP` [5].
|
||||
- Older forum threads assert the FX5U is "client only" [12] — that was true on
|
||||
firmware ≤ 1.040. Firmware 1.060 and later ship the parameter-driven MODBUS
|
||||
TCP server built-in and need no function blocks [11].
|
||||
|
||||
## Modbus device assignment (the parameter block)
|
||||
|
||||
Unlike a DL260 where the CPU exposes a *fixed* V-memory-to-Modbus mapping, every
|
||||
MELSEC MODBUS-TCP module exposes a **Modbus Device Assignment Parameter** block
|
||||
that the engineer configures in GX Works2 / GX Configurator-MB / GX Works3.
|
||||
Each of the four Modbus tables (Coil, Input, Input Register, Holding Register)
|
||||
can be split into up to 16 independent "assignment" entries, each binding a
|
||||
contiguous Modbus address range to a MELSEC device head (`M0`, `D0`, `X0`,
|
||||
`Y0`, `B0`, `W0`, `SM0`, `SD0`, `R0`, etc.) and a point count [3][7][8][9].
|
||||
|
||||
- **There is no canonical "MELSEC Modbus mapping"**. Two sites running the same
|
||||
QJ71MT91 module can expose completely different Modbus layouts. Our driver
|
||||
must treat the mapping as site-data (config-file-driven), not as a device
|
||||
profile constant.
|
||||
- **Default values do exist** — both GX Configurator-MB (for Q/L series) and
|
||||
GX Works3 (for iQ-R / iQ-F / FX5) ship a "dedicated pattern" default that is
|
||||
applied when the engineer does not override the assignment. Per the FX5
|
||||
MODBUS Communication manual (JY997D56101) and the QJ71MT91 manual, the FX5
|
||||
dedicated default is [3][7][11]:
|
||||
|
||||
| Modbus table | Modbus range (0-based) | MELSEC device | Head |
|
||||
|--------------------|------------------------|---------------|------|
|
||||
| Coil (FC01/05/15) | 0 – 7679 | M | M0 |
|
||||
| Coil | 8192 – 8959 | Y | Y0 |
|
||||
| Input (FC02) | 0 – 7679 | M | M0 |
|
||||
| Input | 8192 – 8959 | X | X0 |
|
||||
| Input Register (FC04) | 0 – 6143 | D | D0 |
|
||||
| Holding Register (FC03/06/16) | 0 – 6143 | D | D0 |
|
||||
|
||||
This matches the widely circulated "FC03 @ 0 = D0" convention that shows up
|
||||
in Ubidots / Ignition / AdvancedHMI integration guides [6][12].
|
||||
|
||||
- **X/Y in the default mapping occupy a second, non-zero Modbus range** (8192+
|
||||
on FX5; similar on Q/L/iQ-R). Driver users who expect "X0 = coil 0" will be
|
||||
reading M0 instead. Document this clearly.
|
||||
- **Assignment-range collisions silently disable the slave.** The QJ71MT91
|
||||
manual states explicitly that if any two of assignments 1-16 duplicate the
|
||||
head Modbus device number, the slave function is inactive with no clear
|
||||
error — the module just won't respond [9]. The driver probe will look like a
|
||||
simple timeout; the site engineer has to open GX Configurator-MB to diagnose.
|
||||
|
||||
Test names:
|
||||
`Mitsubishi_FX5U_default_mapping_coil_0_is_M0`,
|
||||
`Mitsubishi_FX5U_default_mapping_holding_0_is_D0`,
|
||||
`Mitsubishi_QJ71MT91_duplicate_assignment_head_disables_slave`.
|
||||
|
||||
## X/Y addressing — hex on MELSEC, decimal on Modbus
|
||||
|
||||
**MELSEC X (input) and Y (output) device numbers are hexadecimal on Q / L /
|
||||
iQ-R** and **octal** on FX / iQ-F (with a GX Works3 toggle) [14][15].
|
||||
|
||||
- On a Q CPU, `X20` means decimal **32**, not 20. On an FX5U in default (octal)
|
||||
mode, `X20` means decimal **16**. GX Works3 exposes a project-level option to
|
||||
display FX5U X/Y in hex to match Q/L/iQ-R convention — the same physical
|
||||
input is then called `X10` [14].
|
||||
- The Modbus Device Assignment Parameter block takes the *head device* as a
|
||||
MELSEC-native number, which is interpreted in the CPU's native base
|
||||
(hex for Q/L/iQ-R, octal for FX/iQ-F). After that, **Modbus offsets from
|
||||
the head are plain decimal** — the module does not apply a second hex
|
||||
conversion [3][9].
|
||||
- Example (QJ71MT91 on a Q CPU): assignment "Coil 0 = X0, 512 points" exposes
|
||||
physical `X0` through `X1FF` (hex) as coils 0-511. A client reading coil 32
|
||||
gets the bit `X20` (hex) — i.e. the 33rd input, not the value at "input 20"
|
||||
that the operator wrote on the wiring diagram in decimal.
|
||||
- **Driver bug source**: if the operator's tag configuration says "read X20" and
|
||||
the driver helpfully converts "20" to decimal 20 → coil offset 20, the
|
||||
returned bit is actually `X14` (hex) — off by twelve. Our config layer must
|
||||
preserve the MELSEC-native base that the site engineer sees in GX Works.
|
||||
- Timers/counters (`T`, `C`, `ST`) are always decimal in MELSEC notation.
|
||||
Internal relays (`M`, `B`, `L`), data registers (`D`, `W`, `R`, `ZR`),
|
||||
and special relays/registers (`SM`, `SD`) also decimal. **Only `X` and `Y`
|
||||
(and on Q/L/iQ-R, `B` link relays and `W` link registers) use hex**, and
|
||||
the X/Y decision is itself family-dependent [14][15].
|
||||
|
||||
Test names:
|
||||
`Mitsubishi_Q_X_address_is_hex_X20_equals_coil_offset_32`,
|
||||
`Mitsubishi_FX5U_X_address_is_octal_X20_equals_coil_offset_16`,
|
||||
`Mitsubishi_W_link_register_is_hex_W10_equals_holding_offset_16`.
|
||||
|
||||
## Word order for 32-bit values
|
||||
|
||||
MELSEC stores 32-bit ladder values (`DINT`, `DWORD`, `REAL` / single-precision
|
||||
float) across **two consecutive D-registers, low word first** — i.e., `CDAB`
|
||||
when viewed as a Modbus register pair [2][6].
|
||||
|
||||
```
|
||||
D100 (low word) : 0xCC 0xDD (big-endian bytes within the word)
|
||||
D101 (high word) : 0xAA 0xBB
|
||||
```
|
||||
|
||||
A Modbus master reading D100/D101 as a `float` with default (ABCD) word order
|
||||
gets garbage. Ignition's built-in Modbus driver notes Mitsubishi as a "CDAB
|
||||
device" specifically for this reason [2].
|
||||
|
||||
- **Q / L / iQ-R / iQ-F all agree** — this is a CPU-level convention, not a
|
||||
module choice. Both the QJ71MT91 manual and the FX5 MODBUS Communication
|
||||
manual describe 32-bit access by "reading the lower 16 bits from the start
|
||||
address and the upper 16 bits from start+1" [6][11].
|
||||
- **Byte order within each register is big-endian** (Modbus standard). The
|
||||
module does not byte-swap.
|
||||
- **Configurable?** The MODBUS modules themselves do **not** expose a word-
|
||||
order toggle; the behavior is fixed to how the CPU laid out the value in the
|
||||
two D-registers. If the ladder programmer used an `SWAP` instruction or a
|
||||
union-style assignment, the word order can be whatever they made it — but
|
||||
for values produced by the standard `D→DBL` and `FLT`/`FLT2` instructions
|
||||
it is always CDAB [2].
|
||||
- **FX5U quirk**: the FX5 MODBUS Communication manual tells the programmer to
|
||||
use the `SWAP` instruction *if* the remote Modbus peer requires
|
||||
little-endian *byte* ordering (BADC) [11]. This is only relevant when the
|
||||
FX5U is the Modbus *client*, but it confirms the FX5U's native wire layout
|
||||
is big-endian-byte / little-endian-word (CDAB) on the server side too.
|
||||
- **Rumoured exception**: a handful of MrPLC forum threads report iQ-R
|
||||
RJ71EN71 firmware < 1.05 returning DWORDs in `ABCD` order when accessed via
|
||||
the built-in Ethernet port's MODBUS slave [8]. _Unconfirmed_; treat as a
|
||||
per-site test.
|
||||
|
||||
Test names:
|
||||
`Mitsubishi_Float32_word_order_is_CDAB`,
|
||||
`Mitsubishi_Int32_word_order_is_CDAB`,
|
||||
`Mitsubishi_FX5U_SWAP_instruction_changes_byte_order_not_word_order`.
|
||||
|
||||
## BCD vs binary encoding
|
||||
|
||||
**MELSEC stores integer values in D-registers as plain binary two's-complement**,
|
||||
not BCD [16]. This is the opposite of AutomationDirect DirectLOGIC, where
|
||||
V-memory defaults to BCD and the ladder must explicitly request binary.
|
||||
|
||||
- A ladder `MOV K1234 D100` stores `0x04D2` (1234 decimal) in D100, not
|
||||
`0x1234`. The Modbus master reads `0x04D2` and decodes it as an integer
|
||||
directly — no BCD conversion needed [16].
|
||||
- **Timer / counter current values** (`T0` current value, `C0` count) are
|
||||
stored in binary as word devices on Q/L/iQ-R/iQ-F. The ladder preset
|
||||
(`K...`) is also binary [16][17].
|
||||
- **Timer / counter preset `K` operand in FX3U / earlier FX**: also binary when
|
||||
loaded from a D-register or a `K` constant. The older A-series CPUs had BCD
|
||||
presets on some timer types, but MELSEC-Q, L, iQ-R, iQ-F, and FX3U all use
|
||||
binary presets by default [17].
|
||||
- The FX3U programming manual dedicates `FNC 18 BCD` and `FNC 19 BIN` to
|
||||
explicit conversion — their existence confirms that anything in D-registers
|
||||
that came from a `BCD` instruction output is BCD, but nothing is BCD by
|
||||
default [17].
|
||||
- **7-segment display registers** are a common site-specific exception — many
|
||||
ladders pack `BCD D100` into a D-register so the operator panel can drive
|
||||
a display directly. Our driver should not assume; expose a per-tag
|
||||
"encoding = binary | BCD" knob.
|
||||
|
||||
Test names:
|
||||
`Mitsubishi_D_register_stores_binary_not_BCD`,
|
||||
`Mitsubishi_FX3U_timer_current_value_is_binary`.
|
||||
|
||||
## Max registers per request
|
||||
|
||||
From the FX5 MODBUS Communication manual Chapter 11 [11]:
|
||||
|
||||
| FC | Name | FX5U (built-in) | QJ71MT91 | iQ-R (RJ71EN71 / built-in) | FX3U-ENET |
|
||||
|----|----------------------------|-----------------|--------------|-----------------------------|-----------|
|
||||
| 01 | Read Coils | 1-2000 | 1-2000 [9] | 1-2000 [8] | 1-2000 |
|
||||
| 02 | Read Discrete Inputs | 1-2000 | 1-2000 | 1-2000 | 1-2000 |
|
||||
| 03 | Read Holding Registers | **1-125** | 1-125 [9] | 1-125 [8] | 1-125 |
|
||||
| 04 | Read Input Registers | 1-125 | 1-125 | 1-125 | 1-125 |
|
||||
| 05 | Write Single Coil | 1 | 1 | 1 | 1 |
|
||||
| 06 | Write Single Register | 1 | 1 | 1 | 1 |
|
||||
| 0F | Write Multiple Coils | 1-1968 | 1-1968 | 1-1968 | 1-1968 |
|
||||
| 10 | Write Multiple Registers | **1-123** | 1-123 | 1-123 | 1-123 |
|
||||
| 16 | Mask Write Register | 1 | not supported | 1 | not supported |
|
||||
| 17 | Read/Write Multiple Regs | R:1-125, W:1-121 | not supported | R:1-125, W:1-121 | not supported |
|
||||
|
||||
- **The FX5U / iQ-R native-port limits match the Modbus spec**: 125 for FC03/04,
|
||||
123 for FC16 [11]. No sub-spec caps like DL260's 100-register ceiling.
|
||||
- **QJ71MT91 does not support FC16 (0x16, Mask Write Register) or FC17
|
||||
(0x17, Read/Write Multiple)** — requesting them returns exception `01`
|
||||
Illegal Function [9]. FX5U and iQ-R *do* support both.
|
||||
- **QJ71MT91 device size**: 64k points (65,536) for each of Coil / Input /
|
||||
Input Register / Holding Register, plus up to 4086k points for Extended
|
||||
File Register via a secondary assignment range [9].
|
||||
- **FX3U-ENET / -P502 function code list is a strict subset** of the common
|
||||
eight (FC01/02/03/04/05/06/0F/10). FC16 and FC17 not supported [4].
|
||||
|
||||
Test names:
|
||||
`Mitsubishi_FX5U_FC03_126_registers_returns_IllegalDataValue`,
|
||||
`Mitsubishi_FX5U_FC16_124_registers_returns_IllegalDataValue`,
|
||||
`Mitsubishi_QJ71MT91_FC16_MaskWrite_returns_IllegalFunction`,
|
||||
`Mitsubishi_QJ71MT91_FC23_ReadWrite_returns_IllegalFunction`.
|
||||
|
||||
## Exception codes
|
||||
|
||||
MELSEC MODBUS modules return **only the standard Modbus exception codes 01-04**;
|
||||
no proprietary exception codes are exposed on the wire [8][9][11]. Module-
|
||||
internal diagnostics (buffer-memory error codes like `7380H`) are logged but
|
||||
not returned as Modbus exceptions.
|
||||
|
||||
| Code | Name | MELSEC trigger |
|
||||
|------|----------------------|---------------------------------------------------------|
|
||||
| 01 | Illegal Function | FC17 or FC16 on QJ71MT91/FX3U; FC08 (Diagnostics); FC43 |
|
||||
| 02 | Illegal Data Address | Modbus address outside any assignment range |
|
||||
| 03 | Illegal Data Value | Quantity out of per-FC range (see table above); odd coil-byte count |
|
||||
| 04 | Server Device Failure | See below |
|
||||
|
||||
- **04 (Server Failure) triggers on MELSEC**:
|
||||
- CPU in STOP or PAUSE during a write to an assignment whose "Access from
|
||||
External Device" permission is set to "Disabled in STOP" [9][11].
|
||||
*With the default "always enabled" setting the write succeeds in STOP
|
||||
mode* — another common trap.
|
||||
- CPU errors (parameter error, watchdog) during any access.
|
||||
- Assignment points to a device range that is not configured (e.g. write
|
||||
to `D16384` when CPU D-device size is 12288).
|
||||
- **Write to a "System Area" device** (e.g., `SD` special registers that are
|
||||
CPU-reserved read-only) returns `04`, not `02`, on QJ71MT91 and iQ-R — the
|
||||
assignment is valid, the device exists, but the CPU rejects the write [8][9].
|
||||
- **FX3U-ENET / -P502** returns `04` on any write attempt while the CPU is in
|
||||
STOP, regardless of permission settings — the older firmware does not
|
||||
implement the "Access from External Device" granularity that Q/L/iQ-R/iQ-F
|
||||
expose [4].
|
||||
- **No rumour of proprietary codes 05-0B** from MELSEC; operators sometimes
|
||||
report "exception 0A" but those traces all came from a third-party gateway
|
||||
sitting between the master and the MELSEC module.
|
||||
|
||||
Test names:
|
||||
`Mitsubishi_QJ71MT91_STOP_mode_write_with_Disabled_permission_returns_ServerFailure`,
|
||||
`Mitsubishi_QJ71MT91_STOP_mode_write_with_default_permission_succeeds`,
|
||||
`Mitsubishi_SD_system_register_write_returns_ServerFailure`,
|
||||
`Mitsubishi_FX3U_STOP_mode_write_always_returns_ServerFailure`.
|
||||
|
||||
## Connection behavior
|
||||
|
||||
Max simultaneous Modbus TCP clients, per module [7][8][9][11]:
|
||||
|
||||
| Model | Max TCP connections | Port 502 | Keepalive | Source |
|
||||
|----------------------|---------------------|----------|-----------|--------|
|
||||
| `QJ71MT91` | 16 (shared with master role) | Yes | No | [9] |
|
||||
| `LJ71MT91` | 16 | Yes | No | [10] |
|
||||
| iQ-R built-in / `RJ71EN71` | 16 | Yes | Configurable (KeepAlive = ON in parameter) | [8] |
|
||||
| iQ-F `FX5U` built-in | 8 | Yes | Configurable | [7][11] |
|
||||
| `FX3U-ENET` | 8 TCP, but **not port 502** | No (port < 1024 blocked) | No | [4][5] |
|
||||
| `FX3U-ENET-P502` | 8, port 502 enabled | Yes | No | [5] |
|
||||
|
||||
- **QJ71MT91's 16 is total connections shared between slave-listen and
|
||||
master-initiated sockets** [9]. A site that uses the same module as both
|
||||
master to downstream VFDs and slave to upstream SCADA splits the 16 pool.
|
||||
- **FX3U-ENET port-502 gotcha**: if the engineer loads a configuration with
|
||||
port 502 into a non-P502 ENET module, GX Works shows the download as
|
||||
successful; on next power cycle the module enters error state and the
|
||||
MODBUS listener never starts. This is documented on third-party FX3G
|
||||
integration guides [6].
|
||||
- **CPU STOP → RUN transition**: does **not** drop Modbus connections on any
|
||||
MELSEC family. Existing sockets stay open; outstanding requests during the
|
||||
transition may see exception 04 for a few scans but then resume [8][9].
|
||||
- **CPU reset (power cycle or `SM1255` forced reset)** drops all Modbus
|
||||
connections and the module re-listens after typically 5-10 seconds.
|
||||
- **Idle timeout**: QJ71MT91 and iQ-R have a per-connection "Alive-Check"
|
||||
(idle timer) parameter, default 0 (disabled). If enabled, default 10 s
|
||||
probe interval, 3 retries before close [8][9]. FX5U similar defaults.
|
||||
- **Keep-alive (TCP-level)**: only iQ-R / iQ-F expose a TCP keep-alive option
|
||||
(parameter "KeepAlive" in the Ethernet settings); QJ71MT91 and FX3U-ENET
|
||||
do not — so NAT/firewall idle drops require driver-side pinging.
|
||||
|
||||
Test names:
|
||||
`Mitsubishi_QJ71MT91_17th_connection_refused`,
|
||||
`Mitsubishi_FX5U_9th_connection_refused`,
|
||||
`Mitsubishi_STOP_to_RUN_transition_preserves_socket`,
|
||||
`Mitsubishi_CPU_reset_closes_all_sockets`.
|
||||
|
||||
## Behavioral oddities
|
||||
|
||||
- **Transaction ID echo**: QJ71MT91 and iQ-R reliably echo the MBAP TxId on
|
||||
every response across firmware revisions; no reports of TxId drops under
|
||||
load [8][9]. FX3U-ENET has an older, less-tested TCP stack; at least one
|
||||
MrPLC thread reports out-of-order TxId echoes under heavy polling on
|
||||
firmware < 1.14 [4]. _Unconfirmed_ on current firmware.
|
||||
- **Per-connection request serialization**: all MELSEC slaves serialize
|
||||
requests within a single TCP connection — a new request is not processed
|
||||
until the prior response has been sent. Pipelining multiple requests on one
|
||||
socket causes the module to queue them in buffer memory and respond in
|
||||
order, but **the queue depth is 1** on QJ71MT91 (a second in-flight request
|
||||
is held on the TCP receive buffer, not queued) [9]. Driver should treat
|
||||
Mitsubishi slaves as strictly single-flight per socket.
|
||||
- **Partial-frame handling**: QJ71MT91 and iQ-R close the socket on malformed
|
||||
MBAP length fields. FX5U resynchronises at the next valid MBAP header
|
||||
within 100 ms but will emit an error to `SD` diagnostics [11]. Driver must
|
||||
reconnect on half-close and replay.
|
||||
- **FX3U UDP vs TCP**: `FX3U-ENET` supports both UDP and TCP MODBUS transports;
|
||||
UDP is lossy and reorders under load. Default is TCP. Some legacy SCADA
|
||||
configurations pinned the module to UDP for multicast discovery — do not
|
||||
select UDP unless the site requires it [4].
|
||||
- **Known firmware-revision variants**:
|
||||
- QJ71MT91 ≤ firmware 10052000000 (year-month format): FC15 with coil
|
||||
count that forces byte-count to an odd value silently truncates the
|
||||
last coil. Fixed in later revisions [9]. _Operator-reported_.
|
||||
- FX5U firmware < 1.060: no native MODBUS TCP server — only accessible via
|
||||
a predefined-protocol function block hack. Firmware ≥ 1.060 ships
|
||||
parameter-based server. Our capability probe should read `SD203`
|
||||
(firmware version) and flag < 1.060 as unsupported for server mode [11][12].
|
||||
- iQ-R RJ71EN71 early firmware: possible ABCD word order (rumoured,
|
||||
unconfirmed) [8].
|
||||
- **SD (special-register) reads during assignment-parameter load**: while
|
||||
the CPU is loading a new MODBUS device assignment parameter (~1-2 s), the
|
||||
slave returns exception 04 Server Failure on every request. Happens after
|
||||
a parameter write from GX Configurator-MB [9].
|
||||
- **iQ-R "Station-based block transfer" collision**: if the RJ71EN71 is also
|
||||
running CC-Link IE Control on the same module, a MODBUS/TCP request that
|
||||
arrives during a CCIE cyclic period is delayed to the next scan — visible
|
||||
as jittery response time, not a failure [8].
|
||||
|
||||
Test names:
|
||||
`Mitsubishi_QJ71MT91_single_flight_per_socket`,
|
||||
`Mitsubishi_FX5U_malformed_MBAP_resync_within_100ms`,
|
||||
`Mitsubishi_FX3U_TxId_preserved_across_burst`,
|
||||
`Mitsubishi_FX5U_firmware_below_1_060_reports_no_server_mode`.
|
||||
|
||||
## Model-specific differences for test coverage
|
||||
|
||||
Summary of which quirks differ per model, so test-class naming can reflect them:
|
||||
|
||||
| Quirk | QJ71MT91 | LJ71MT91 | iQ-R (RJ71EN71 / built-in) | iQ-F (FX5U) | FX3U-ENET(-P502) |
|
||||
|------------------------------------------|----------|----------|----------------------------|-------------|------------------|
|
||||
| FC16 Mask-Write supported | No | No | Yes | Yes | No |
|
||||
| FC17 Read/Write Multiple supported | No | No | Yes | Yes | No |
|
||||
| Max connections | 16 | 16 | 16 | 8 | 8 |
|
||||
| X/Y numbering base | hex | hex | hex | octal (default) | octal |
|
||||
| 32-bit word order | CDAB | CDAB | CDAB (firmware-dependent rumour of ABCD) | CDAB | CDAB |
|
||||
| Port 502 supported | Yes | Yes | Yes | Yes | P502 only |
|
||||
| STOP-mode write permission configurable | Yes | Yes | Yes | Yes | No (always blocks) |
|
||||
| TCP keep-alive parameter | No | No | Yes | Yes | No |
|
||||
| Modbus device assignment — max entries | 16 | 16 | 16 | 16 | 8 |
|
||||
| Server via parameter (no FB) | Yes | Yes | Yes | Yes (fw ≥ 1.060) | Yes |
|
||||
|
||||
- **Test file layout**: `Mitsubishi_QJ71MT91_*`, `Mitsubishi_LJ71MT91_*`,
|
||||
`Mitsubishi_iQR_*`, `Mitsubishi_FX5U_*`, `Mitsubishi_FX3U_ENET_*`,
|
||||
`Mitsubishi_FX3U_ENET_P502_*`. iQ-R built-in Ethernet and the RJ71EN71
|
||||
behave identically for MODBUS/TCP slave purposes and can share a file
|
||||
`Mitsubishi_iQR_*`.
|
||||
- **Cross-model shared tests** (word order CDAB, binary not BCD, standard
|
||||
exception codes, 125-register FC03 cap) can live in a single
|
||||
`Mitsubishi_Common_*` fixture.
|
||||
|
||||
## References
|
||||
|
||||
1. Mitsubishi Electric, *MODBUS Interface Module User's Manual — QJ71MB91*
|
||||
(SH-080578ENG), RS-232/422/485 MODBUS RTU serial module for MELSEC-Q —
|
||||
https://dl.mitsubishielectric.com/dl/fa/document/manual/plc/sh080578eng/sh080578engk.pdf
|
||||
2. Inductive Automation, *Ignition Modbus Driver — Mitsubishi Q / iQ-R word
|
||||
order*, documents CDAB convention —
|
||||
https://docs.inductiveautomation.com/docs/8.1/ignition-modules/opc-ua/drivers/modbus-v2
|
||||
and forum discussion https://forum.inductiveautomation.com/t/modbus-tcp-device-word-byte-order/65984
|
||||
3. Mitsubishi Electric, *Programmable Controller User's Manual QJ71MB91 MODBUS
|
||||
Interface Module*, Chapter 7 "Parameter Setting" describing the Modbus
|
||||
Device Assignment Parameter block (assignments 1-16, head-device
|
||||
configuration) —
|
||||
https://www.lcautomation.com/dbdocument/29156/QJ71MB91%20Users%20manual.pdf
|
||||
4. Mitsubishi Electric, *FX3U-ENET User's Manual* (JY997D18101), Chapter on
|
||||
MODBUS/TCP communication; function code support and connection limits —
|
||||
https://dl.mitsubishielectric.com/dl/fa/document/manual/plc_fx/jy997d18101/jy997d18101h.pdf
|
||||
5. Venus Automation, *Mitsubishi FX3U-ENET-P502 Module — Open Port 502 for
|
||||
Modbus TCP/IP* —
|
||||
https://venusautomation.com.au/mitsubishi-fx3u-enet-p502-module-open-port-502-for-modbus-tcp-ip/
|
||||
and FX3U-ENET-ADP user manual (JY997D45801), which confirms the -ADP
|
||||
variant does not support MODBUS —
|
||||
https://dl.mitsubishielectric.com/dl/fa/document/manual/plc_fx/jy997d45801/jy997d45801h.pdf
|
||||
6. XML Control / Ubidots integration notes, *FX3G Modbus* — port-502 trap,
|
||||
D-register mapping default, word order reference —
|
||||
https://sites.google.com/site/xmlcontrol/archive/fx3g-modbus
|
||||
and https://ubidots.com/blog/mitsubishi-plc-as-modbus-tcp-server/
|
||||
7. FA Support Me, *Modbus TCP on Built-in Ethernet port in iQ-F and iQ-R* —
|
||||
confirms 16-connection limit on iQ-R, 8 on iQ-F, parameter-driven
|
||||
configuration via GX Works3 —
|
||||
https://www.fasupportme.com/portal/en/kb/articles/modbus-tcp-on-build-in-ethernet-port-in-iq-f-and-iq-r-en
|
||||
8. Mitsubishi Electric, *MELSEC iQ-R Ethernet User's Manual (Application)*
|
||||
(SH-081259ENG) and *MELSEC iQ-RJ71EN71 User's Manual* Chapter on
|
||||
"Communications Using Modbus/TCP" —
|
||||
https://www.allied-automation.com/wp-content/uploads/2015/02/MITSUBISHI_manual_plc_iq-r_ethernet_users.pdf
|
||||
and https://www.manualslib.com/manual/1533351/Mitsubishi-Electric-Melsec-Iq-Rj71en71.html?page=109
|
||||
9. Mitsubishi Electric, *MODBUS/TCP Interface Module User's Manual — QJ71MT91*
|
||||
(SH-080446ENG), exception codes page 248, device assignment parameter
|
||||
pages 116-124, duplicate-assignment-disables-slave note —
|
||||
https://dl.mitsubishielectric.com/dl/fa/document/manual/plc/sh080446eng/sh080446engj.pdf
|
||||
10. Mitsubishi Electric, *MELSEC-L Network Features* — LJ71MT91 documented as
|
||||
L-series equivalent of QJ71MT91 with identical MODBUS/TCP behavior —
|
||||
https://us.mitsubishielectric.com/fa/en/products/cnt/programmable-controllers/melsec-l-series/network/features/
|
||||
11. Mitsubishi Electric, *MELSEC iQ-F FX5 User's Manual (MODBUS Communication)*
|
||||
(JY997D56101), Chapter 11 "Modbus/TCP Communication Specifications" —
|
||||
function code max-quantity table, frame specification, device assignment
|
||||
defaults —
|
||||
https://dl.mitsubishielectric.com/dl/fa/document/manual/plcf/jy997d56101/jy997d56101h.pdf
|
||||
12. MrPLC forum, *FX5U Modbus-TCP Server (Slave)*, firmware ≥ 1.60 enables
|
||||
native server via parameter; earlier firmware required function block —
|
||||
https://mrplc.com/forums/topic/31883-fx5u-modbus-tcp-server-slave/
|
||||
and Industrial Monitor Direct's "FX5U MODBUS TCP Server Workaround"
|
||||
article (reflects older firmware behavior) —
|
||||
https://industrialmonitordirect.com/blogs/knowledgebase/mitsubishi-fx5u-modbus-tcp-server-configuration-workaround
|
||||
13. Mitsubishi Electric, *MELSEC iQ-R MODBUS and MODBUS/TCP Reference Manual —
|
||||
RJ71C24 / RJ71C24-R2* (BCN-P5999-1060) — RJ71C24 is serial RTU only,
|
||||
not TCP —
|
||||
https://dl.mitsubishielectric.com/dl/fa/document/manual/plc/bcn-p5999-1060/bcnp59991060b.pdf
|
||||
14. HMS Industrial Networks, *eWON and Mitsubishi FX5U PLC* (KB-0264-00) —
|
||||
documents that FX5U X/Y are octal in GX Works3 but hex when viewed as a
|
||||
Q-series PLC through eWON; the project-level hex/octal toggle —
|
||||
https://hmsnetworks.blob.core.windows.net/www/docs/librariesprovider10/downloads-monitored/manuals/knowledge-base/kb-0264-00-en-ewon-and-mitsubishi-fx5u-plc.pdf
|
||||
15. Fernhill Software, *Mitsubishi Melsec PLC Data Address* — documents
|
||||
hex-vs-octal device numbering split across MELSEC families —
|
||||
https://www.fernhillsoftware.com/help/drivers/mitsubishi-melsec/data-address-format.html
|
||||
16. Inductive Automation support, *Understanding Mitsubishi PLCs* — D registers
|
||||
store signed 16-bit binary, not BCD; DINT combines two consecutive D
|
||||
registers —
|
||||
https://support.inductiveautomation.com/hc/en-us/articles/16517576753165-Understanding-Mitsubishi-PLCs
|
||||
17. Mitsubishi Electric, *FXCPU Structured Programming Manual [Device &
|
||||
Common]* (JY997D26001) — FNC 18 BCD and FNC 19 BIN explicit-conversion
|
||||
instructions confirm binary-by-default storage —
|
||||
https://dl.mitsubishielectric.com/dl/fa/document/manual/plc_fx/jy997d26001/jy997d26001l.pdf
|
||||
121
docs/v2/modbus-test-plan.md
Normal file
121
docs/v2/modbus-test-plan.md
Normal file
@@ -0,0 +1,121 @@
|
||||
# Modbus driver — test plan + device-quirk catalog
|
||||
|
||||
The Modbus TCP driver unit tests (PRs 21–24) cover the protocol surface against an
|
||||
in-memory fake transport. They validate the codec, state machine, and function-code
|
||||
routing against a textbook Modbus server. That's necessary but not sufficient: real PLC
|
||||
populations disagree with the spec in small, device-specific ways, and a driver that
|
||||
passes textbook tests can still misbehave against actual equipment.
|
||||
|
||||
This doc is the harness-and-quirks playbook. The project it describes lives at
|
||||
`tests/ZB.MOM.WW.OtOpcUa.Driver.Modbus.IntegrationTests/` — scaffolded in PR 30 with
|
||||
the simulator fixture, DL205 profile stub, and one write/read smoke test. Each
|
||||
confirmed DL205 quirk lands in a follow-up PR as a named test in that project.
|
||||
|
||||
## Harness
|
||||
|
||||
**Chosen simulator: pymodbus 3.13.0** (`pip install 'pymodbus[simulator]==3.13.0'`).
|
||||
Replaced ModbusPal in PR 43 — see `tests/.../Pymodbus/README.md` for the
|
||||
trade-off rationale. Headline reasons:
|
||||
|
||||
- **Headless** pure-Python CLI; no Java GUI, runs cleanly on a CI runner.
|
||||
- **Maintained** — current stable 3.13.0; ModbusPal 1.6b is abandoned.
|
||||
- **All four standard tables** (HR, IR, coils, DI) configurable; ModbusPal
|
||||
1.6b only exposed HR + coils.
|
||||
- **Built-in actions** (`increment`, `random`, `timestamp`, `uptime`) +
|
||||
optional custom-Python actions for declarative dynamic behaviors.
|
||||
- **Per-register raw uint16 seeding** — encoding the DL205 string-byte-order
|
||||
/ BCD / CDAB-float quirks stays explicit (the quirk math lives in the
|
||||
`_quirk` JSON-comment fields next to each register).
|
||||
- Pip-installable on Windows; sidesteps the privileged-port admin
|
||||
requirement by defaulting to TCP **5020** instead of 502.
|
||||
|
||||
**Setup pattern**:
|
||||
1. `pip install "pymodbus[simulator]==3.13.0"`.
|
||||
2. Start the simulator with one of the in-repo profiles:
|
||||
`tests\.../Pymodbus\serve.ps1 -Profile standard` (or `-Profile dl205`).
|
||||
3. `dotnet test tests\ZB.MOM.WW.OtOpcUa.Driver.Modbus.IntegrationTests` —
|
||||
tests auto-skip when the endpoint is unreachable. Default endpoint is
|
||||
`localhost:5020`; override via `MODBUS_SIM_ENDPOINT` for a real PLC on its
|
||||
native port 502.
|
||||
|
||||
## Per-device quirk catalog
|
||||
|
||||
### AutomationDirect DL205 / DL260
|
||||
|
||||
First known target device family. **Full quirk catalog with primary-source citations
|
||||
and per-quirk integration-test names lives at [`dl205.md`](dl205.md)** — that doc is
|
||||
the reference; this section is the testing roadmap.
|
||||
|
||||
Confirmed quirks (priority order — top items are highest-impact for our driver
|
||||
and ship first as PR 41+):
|
||||
|
||||
| Quirk | Driver impact | Integration-test name |
|
||||
|---|---|---|
|
||||
| **String packing**: 2 chars/register, **first char in low byte** (opposite of generic Modbus) | `ModbusDataType.String` decoder must be configurable per-device family — current code assumes high-byte-first | `DL205_String_low_byte_first_within_register` |
|
||||
| **Word order CDAB** for Int32/UInt32/Float32 | Already configurable via `ModbusByteOrder.WordSwap`; default per device profile | `DL205_Int32_word_order_is_CDAB` |
|
||||
| **BCD-as-default** numeric storage (only IEEE 754 when ladder uses `R` type) | New decoder mode — register reads as `0x1234` for ladder value `1234`, not as decimal `4660` | `DL205_BCD_register_decodes_as_hex_nibbles` |
|
||||
| **FC16 capped at 100 registers** (below the spec's 123) | Bulk-write batching must cap per-device-family | `DL205_FC16_101_registers_returns_IllegalDataValue` |
|
||||
| **FC03/04 capped at 128** (above the spec's 125) | Less impactful — clients that respect the spec's 125 stay safe | `DL205_FC03_129_registers_returns_IllegalDataValue` |
|
||||
| **V-memory octal-to-decimal addressing** (V2000 octal → 0x0400 decimal) | New address-format helper in profile config so operators can write `V2000` instead of computing `1024` themselves | `DL205_Vmem_V2000_maps_to_PDU_0x0400` |
|
||||
| **C-relay → coil 3072 / Y-output → coil 2048** offsets | Hard-coded constants in DL205 device profile | `DL205_C0_maps_to_coil_3072`, `DL205_Y0_maps_to_coil_2048` |
|
||||
| **Register 0 is valid** (rejects-register-0 rumour was DL05/DL06 relative-mode artefact) | None — current default is safe | `DL205_FC03_register_0_returns_V0_contents` |
|
||||
| **Max 4 simultaneous TCP clients** on H2-ECOM100 | Connect-time: handle TCP-accept failure with a clearer error message | `DL205_5th_TCP_connection_refused` |
|
||||
| **No TCP keepalive** | Driver-side periodic-probe (already wired via `IHostConnectivityProbe`) | _Covered by existing `ModbusProbeTests`_ |
|
||||
| **No mid-stream resync on malformed MBAP** | Already covered — single-flight + reconnect-on-error | _Covered by existing `ModbusDriverTests`_ |
|
||||
| **Write-protect exception code: `02` newer / `04` older** | Translate either to `BadNotWritable` | `DL205_FC06_in_ProgramMode_returns_ServerFailure` |
|
||||
|
||||
_Operator-reported / unconfirmed_ — covered defensively in the driver but no
|
||||
integration tests until reproduced on hardware:
|
||||
- TxId drop under load (forum rumour; not reproduced).
|
||||
- Pre-2004 firmware ABCD word order (every shipped DL205/DL260 since 2004 is CDAB).
|
||||
|
||||
### Future devices
|
||||
|
||||
One section per device class, same shape as DL205. Quirks that apply across
|
||||
multiple devices (e.g., "all AB PLCs use CDAB") can be noted in the cross-device
|
||||
patterns section below once we have enough data points.
|
||||
|
||||
## Cross-device patterns
|
||||
|
||||
Once multiple device catalogs accumulate, quirks that recur across two or more
|
||||
vendors get promoted into driver defaults or opt-in options:
|
||||
|
||||
- _(empty — filled in as catalogs grow)_
|
||||
|
||||
## Test conventions
|
||||
|
||||
- **One named test per quirk.** `DL205_word_order_is_CDAB_for_Float32` is easier to
|
||||
diagnose on failure than a generic `Float32_roundtrip`. The `DL205_` prefix makes
|
||||
filtering by device class trivial (`--filter "DisplayName~DL205"`).
|
||||
- **Skip with a clear SkipReason.** Follow the pattern from
|
||||
`GalaxyRepositoryLiveSmokeTests`: check reachability in the fixture, capture
|
||||
a `SkipReason` string, and have each test call `Assert.Skip(SkipReason)` when
|
||||
it's set. Don't throw — skipped tests read cleanly in CI logs.
|
||||
- **Use the real `ModbusTcpTransport`.** Integration tests exercise the wire
|
||||
protocol end-to-end. The in-memory `FakeTransport` from the unit test suite is
|
||||
deliberately not used here — its value is speed + determinism, which doesn't
|
||||
help reproduce device-specific issues.
|
||||
- **Don't depend on simulator state between tests.** Each test resets the
|
||||
simulator's register bank or uses a unique address range. Avoid relying on
|
||||
"previous test left value at register 10" setups that flake when tests run in
|
||||
parallel or re-order. Either the test mutates the scratch ranges and restores
|
||||
on finally, or it uses pymodbus's REST API to reset state between facts.
|
||||
|
||||
## Next concrete PRs
|
||||
|
||||
- **PR 30 — Integration test project + DL205 profile scaffold** — **DONE**.
|
||||
Shipped `tests/ZB.MOM.WW.OtOpcUa.Driver.Modbus.IntegrationTests` with
|
||||
`ModbusSimulatorFixture` (TCP-probe, skips with a clear `SkipReason` when the
|
||||
endpoint is unreachable), `DL205/DL205Profile.cs` (tag map stub), and
|
||||
`DL205/DL205SmokeTests.cs` (write-then-read round-trip).
|
||||
- **PR 41 — DL205 quirk catalog doc** — **DONE**. `docs/v2/dl205.md`
|
||||
documents every DL205/DL260 Modbus divergence with primary-source citations.
|
||||
- **PR 42 — ModbusPal `.xmpp` profiles** — **SUPERSEDED by PR 43**. Replaced
|
||||
with pymodbus JSON because ModbusPal 1.6b is abandoned, GUI-only, and only
|
||||
exposes 2 of the 4 standard tables.
|
||||
- **PR 43 — pymodbus JSON profiles** — **DONE**. `Pymodbus/standard.json` +
|
||||
`Pymodbus/dl205.json` + `Pymodbus/serve.ps1` runner. Both bind TCP 5020.
|
||||
- **PR 44+**: one PR per confirmed DL205 quirk, landing the named test + any
|
||||
driver-side adjustment (string byte order, BCD decoder, V-memory address
|
||||
helper, FC16 cap-per-device-family) needed to pass it. Each quirk's value
|
||||
is already pre-encoded in `Pymodbus/dl205.json`.
|
||||
@@ -909,6 +909,26 @@ Each step leaves the system runnable. The generic extraction is effectively free
|
||||
| 140 | Enterprise shortname = `zb` (UNS level-1 segment) | Closes corrections-doc D4. Matches the existing `ZB.MOM.WW.*` namespace prefix used throughout the codebase; short by design since this segment appears in every equipment path (`zb/warsaw-west/bldg-3/line-2/cnc-mill-05/RunState`); operators already say "ZB" colloquially. Admin UI cluster-create form default-prefills `zb` for the Enterprise field. Production deployments use it directly from cluster-create | 2026-04-17 |
|
||||
| 141 | Tier 3 (AppServer IO) cutover is feasible — AVEVA's OI Gateway supports arbitrary upstream OPC UA servers as a documented pattern | Closes corrections-doc E2 with **GREEN-YELLOW** verdict. Multiple AVEVA partners (Software Toolbox, InSource) have published working integrations against four different non-AVEVA upstream servers (TOP Server, OPC Router, OmniServer, Cogent DataHub). No re-architecting of OtOpcUa required. Path: `OPC UA node → OI Gateway → SuiteLink → $DDESuiteLinkDIObject → AppServer attribute`. Recommended AppServer floor: System Platform 2023 R2 Patch 01. Two integrator-burden risks tracked: validation/GxP paperwork (no AVEVA blueprint exists for non-AVEVA upstream servers in Part 11 deployments) and unpublished scale benchmarks (in-house benchmark required before cutover scheduling). See `aveva-system-platform-io-research.md` | 2026-04-17 |
|
||||
| 142 | Phase 1 acceptance includes an end-to-end AppServer-via-OI-Gateway smoke test against OtOpcUa | Catches AppServer-specific quirks (cert exchange via reject-and-trust workflow, endpoint URL must NOT include `/discovery` suffix per Inductive Automation forum failure mode, service-account install required because OI Gateway under SYSTEM cannot connect to remote OPC servers, `Basic256Sha256` + `SignAndEncrypt` + LDAP-username token combination must work end-to-end) early — well before the Year 3 tier-3 cutover schedule. Adds one task to `phase-1-configuration-and-admin-scaffold.md` Stream E (Admin smoke test) | 2026-04-17 |
|
||||
| 143 | Polly per-capability policy — Read / HistoryRead / Discover / Probe / Alarm-subscribe auto-retry; Write does NOT auto-retry unless the tag metadata carries `[WriteIdempotent]` | Decisions #44-45 forbid auto-retry on Write because a timed-out write can succeed on the device + be replayed by the pipeline, duplicating pulses / alarm acks / counter increments / recipe-step advances. Per-capability policy in the shared Polly layer makes the retry safety story explicit; `WriteIdempotentAttribute` on tag definitions is the opt-in surface | 2026-04-19 |
|
||||
| 144 | Polly pipeline key = `(DriverInstanceId, HostName)`, not DriverInstanceId alone | Decision #35 requires per-device isolation. One dead PLC behind a multi-device Modbus driver must NOT open the circuit breaker for healthy sibling hosts. Per-instance pipelines would poison every device behind one bad endpoint | 2026-04-19 |
|
||||
| 145 | Tier A/B/C runtime enforcement splits into `MemoryTracking` (all tiers — soft/hard thresholds log + surface, NEVER kill) and `MemoryRecycle` (Tier C only — requires out-of-process topology). Tier A/B hard-breach logs a promotion-to-Tier-C recommendation; the runtime never auto-kills an in-process driver | Decisions #73-74 reserve process-kill protections for Tier C. An in-process Tier A/B "recycle" would kill every OPC UA session + every other in-proc driver for one leaky instance, blast-radius worse than the leak | 2026-04-19 |
|
||||
| 146 | Memory watchdog uses the hybrid formula `soft = max(multiplier × baseline, baseline + floor)`, with baseline captured as the median of the first 5 min of `GetMemoryFootprint()` samples post-InitializeAsync. Tier-specific constants: A multiplier=3 floor=50 MB, B multiplier=3 floor=100 MB, C multiplier=2 floor=500 MB. Hard = 2 × soft | Codex adversarial review on the Phase 6.1 plan flagged that hardcoded per-tier MB bands diverge from decision #70's specified formula. Static bands false-trigger on small-footprint drivers + miss meaningful growth on large ones. Observed-baseline + hybrid formula recovers the original intent | 2026-04-19 |
|
||||
| 147 | `WedgeDetector` uses demand-aware criteria `(state==Healthy AND hasPendingWork AND noProgressIn > threshold)`. `hasPendingWork` = (Polly bulkhead depth > 0) OR (active MonitoredItem count > 0) OR (queued historian read count > 0). Idle + subscription-only + write-only-burst drivers stay Healthy without false-fault | Previous "no successful Read in N intervals" formulation flipped legitimate idle subscribers, slow historian backfills, and write-heavy drivers to Faulted. The demand-aware check only fires when the driver claims work is outstanding | 2026-04-19 |
|
||||
| 148 | LiteDB config cache is **generation-sealed**: `sp_PublishGeneration` writes `<cache-root>/<cluster>/<generationId>.db` as a read-only sealed file; cache reads serve the last-known-sealed generation. Mixed-generation reads are impossible | Prior "refresh on every successful query" cache could serve LDAP role mapping from one generation alongside UNS topology from another, producing impossible states. Sealed-snapshot invariant keeps cache-served reads coherent with a real published state | 2026-04-19 |
|
||||
| 149 | `AuthorizationDecision { Allow \| NotGranted \| Denied, IReadOnlyList<MatchedGrant> Provenance }` — tri-state internal model. Phase 6.2 only produces `Allow` + `NotGranted` (grant-only semantics per decision #129); v2.1 Deny widens without API break | bool return would collapse `no-matching-grant` and `explicit-deny` into the same runtime state + UI explanation; provenance record is needed for the audit log anyway. Making the shape tri-state from Phase 6.2 avoids a breaking change in v2.1 | 2026-04-19 |
|
||||
| 150 | Data-plane ACL evaluator consumes `NodeAcl` rows joined against the session's resolved LDAP group memberships. `LdapGroupRoleMapping` (decision #105) is control-plane only — routes LDAP groups to Admin UI roles. Zero runtime overlap between the two | Codex adversarial review flagged that Phase 6.2 draft conflated the two — building the data-plane trie from `LdapGroupRoleMapping` would let a user inherit tag permissions from an admin-role claim path never intended as a data-path grant | 2026-04-19 |
|
||||
| 151 | `UserAuthorizationState` cached per session but bounded by `MembershipFreshnessInterval` (default 15 min). Past that interval the next hot-path authz call re-resolves LDAP group memberships; failure to re-resolve (LDAP unreachable) → fail-closed (evaluator returns `NotGranted` until memberships refresh successfully) | Previous design cached memberships until session close, so a user removed from a privileged LDAP group could keep authorized access for hours. Bounded freshness + fail-closed covers the revoke-takes-effect story | 2026-04-19 |
|
||||
| 152 | Auth cache has its own staleness budget `AuthCacheMaxStaleness` (default 5 min), independent of decision #36's availability-oriented config cache (24 h). Past 5 min on authorization data, evaluator fails closed regardless of whether the underlying config is still serving from cache | Availability-oriented caches trade correctness for uptime. Authorization data is correctness-sensitive — stale ACLs silently extend revoked access. Auth-specific budget keeps the two concerns from colliding | 2026-04-19 |
|
||||
| 153 | MonitoredItem carries `(AuthGenerationId, MembershipVersion)` stamp at create time. On every Publish, items with a mismatching stamp re-evaluate; unchanged items stay fast-path. Revoked items drop to `BadUserAccessDenied` within one publish cycle | Create-time-only authorization leaves revoked users receiving data forever; per-publish re-authorization at 100 ms cadence across 50 groups × 6 levels is too expensive. Stamp-then-reevaluate-on-change balances correctness with cost | 2026-04-19 |
|
||||
| 154 | ServiceLevel reserves `0` for operator-declared maintenance only; `1` = NoData (unreachable / Faulted); operational states occupy `2..255` in an 8-state matrix (Authoritative-Primary=255, Isolated-Primary=230, Primary-Mid-Apply=200, Recovering-Primary=180, Authoritative-Backup=100, Isolated-Backup=80, Backup-Mid-Apply=50, Recovering-Backup=30, InvalidTopology=2) | OPC UA Part 5 §6.3.34 defines `0=Maintenance` + `1=NoData`; using `0` for our Faulted case collides with spec + triggers spec-compliant clients to enter maintenance-mode cutover. Expanded 8-state matrix covers operational states the 5-state original collapsed together (e.g. Isolated-Primary vs Primary-Mid-Apply were both 200) | 2026-04-19 |
|
||||
| 155 | `ServerUriArray` includes self + peers (self first, deterministic ordering), per OPC UA Part 4 §6.6.2.2 | Previous design excluded self from the array — spec violation + clients lose the ability to map server identities consistently during failover | 2026-04-19 |
|
||||
| 156 | Redundancy peer health uses a two-layer probe: `/healthz` (2 s) as fast-fail + `UaHealthProbe` (10 s, opens OPC UA client session to peer + reads its `ServiceLevel` node) as the authority signal. HTTP-healthy ≠ UA-authoritative | `/healthz` returns 200 whenever HTTP + config DB/cache is healthy — but a peer can be HTTP-healthy with a broken OPC UA endpoint or a stuck subscription publisher. Using HTTP alone would advertise authority against servers that can't actually publish data | 2026-04-19 |
|
||||
| 157 | Publish-generation fencing — coordinator CAS on a monotonic `ConfigGenerationId`; every topology + role decision is generation-stamped; peers reject state propagated from a lower generation. Runtime `InvalidTopology` state (both self-demote to ServiceLevel 2) when >1 Primary detected post-startup | Operator race publishing two drafts with different roles can produce two locally-valid views; without fencing + runtime containment both nodes can serve as Primary until manual intervention | 2026-04-19 |
|
||||
| 158 | Apply-window uses named leases keyed to `(ConfigGenerationId, PublishRequestId)` via `await using`. `ApplyLeaseWatchdog` auto-closes any lease older than `ApplyMaxDuration` (default 10 min) | Simple `IDisposable`-counter design leaks on cancellation / async-ownership races; a stuck positive count leaves the node permanently mid-apply. Generation-keyed leases + watchdog bound worst case | 2026-04-19 |
|
||||
| 159 | CSV import header row must start with `# OtOpcUaCsv v1` (version marker). Future shape changes bump the version; parser forks per version. Canonical identifier columns follow decision #117: `ZTag, MachineCode, SAPID, EquipmentId, EquipmentUuid` | Without a version marker the CSV schema has no upgrade path — adding a required column breaks every old export silently. The version prefix makes parser dispatch explicit + future-compatible | 2026-04-19 |
|
||||
| 160 | Equipment CSV import uses a staged-import pattern: `EquipmentImportBatch` + `EquipmentImportRow` tables receive chunked inserts; `FinaliseImportBatch` is one atomic transaction that applies accepted rows to `Equipment` + `ExternalIdReservation`. Rollback = drop the batch row; `Equipment` never partially mutates | 10k-row single-transaction import holds locks too long; chunked direct writes lose all-or-nothing rollback. Staging + atomic finalize bounds transaction duration + preserves rollback semantics | 2026-04-19 |
|
||||
| 161 | UNS drag-reorder impact preview carries a `DraftRevisionToken`; Confirm re-checks against the current draft + returns `409 Conflict / refresh-required` if the draft advanced between preview and commit | Without concurrency control, two operators editing the same draft can overwrite each other's changes silently. Draft-revision token + 409 response makes the race visible + forces refresh | 2026-04-19 |
|
||||
| 162 | OPC 40010 Identification sub-folder exposed under each equipment node inherits the Equipment scope's ACL grants — the ACL trie does NOT add a new scope level for Identification | Adding a new scope level for Identification would require every grant to add a second grant for `Equipment/Identification`; inheriting the Equipment scope keeps the grant model flat + prevents operator-forgot-to-grant-Identification access surprises | 2026-04-19 |
|
||||
|
||||
## Reference Documents
|
||||
|
||||
|
||||
485
docs/v2/s7.md
Normal file
485
docs/v2/s7.md
Normal file
@@ -0,0 +1,485 @@
|
||||
# Siemens SIMATIC S7 (S7-1200 / S7-1500 / S7-300 / S7-400 / ET 200SP) — Modbus TCP quirks
|
||||
|
||||
Siemens S7 PLCs do *not* speak Modbus TCP natively at the OS/firmware level. Every
|
||||
S7 Modbus-TCP-server deployment is either (a) the **`MB_SERVER`** library block
|
||||
running on the CPU's PROFINET port (S7-1200 / S7-1500 / CPU 1510SP-series
|
||||
ET 200SP), or (b) the **`MODBUSCP`** function block running on a separate
|
||||
communication processor (**CP 343-1 / CP 343-1 Lean** on S7-300, **CP 443-1** on
|
||||
S7-400), or (c) the **`MODBUSPN`** block on an S7-1500 PN port via a licensed
|
||||
library. That means the quirks a Modbus client has to cope with are as much
|
||||
"this is how the user's PLC programmer wired the library block up" as "this is
|
||||
how the firmware behaves" — the byte-order and coil-mapping rules aren't
|
||||
hard-wired into silicon like they are on a DL260. This document catalogues the
|
||||
behaviours a driver has to handle across the supported model/CP variants, cites
|
||||
primary sources, and names the ModbusPal integration test we'd write for each
|
||||
(convention from `docs/v2/modbus-test-plan.md`: `S7_<model>_<behavior>`).
|
||||
|
||||
## Model / CP Capability Matrix
|
||||
|
||||
| PLC family | Modbus TCP server mechanism | Modbus TCP client mechanism | License required? | Typical port 502 source |
|
||||
|---------------------|------------------------------------|------------------------------------|-----------------------|-----------------------------------------------------------|
|
||||
| S7-1200 (V4.0+) | `MB_SERVER` on integrated PN port | `MB_CLIENT` | No (in TIA Portal) | CPU's onboard Ethernet [1][2] |
|
||||
| S7-1500 (all) | `MB_SERVER` on integrated PN port | `MB_CLIENT` | No (in TIA Portal) | CPU's onboard Ethernet [1][3] |
|
||||
| S7-1500 + CP 1543-1 | `MB_SERVER` on CP's IP | `MB_CLIENT` | No | Separate CP IP address [1] |
|
||||
| ET 200SP CPU (1510SP, 1512SP) | `MB_SERVER` on PN port | `MB_CLIENT` | No | CPU's onboard Ethernet [3] |
|
||||
| S7-300 + CP 343-1 / CP 343-1 Lean | `MODBUSCP` (FB `MODBUSCP`, instance DB per connection) | Same FB, client mode | **Yes — 2XV9450-1MB00** per CP | CP's Ethernet port [4][5] |
|
||||
| S7-400 + CP 443-1 | `MODBUSCP` | `MODBUSCP` client mode | **Yes — 2XV9450-1MB00** per CP | CP's Ethernet port [4] |
|
||||
| S7-400H + CP 443-1 (redundant H) | `MODBUSCP_REDUNDANT` / paired FBs | Not typical | Yes | Paired CPs in H-system [6] |
|
||||
| S7-300 / S7-400 CPU PN (e.g. CPU 315-2 PN/DP) | `MODBUSPN` library | `MODBUSPN` client mode | **Yes** — Modbus-TCP PN CPU lib | CPU's PN port [7] |
|
||||
| "CP 343-1 Lean" | **Server only** (no client mode supported by Lean) | — | Yes, but with restrictions | CP's Ethernet port [4][5] |
|
||||
|
||||
- **CP 343-1 Lean is server-only.** It can host `MODBUSCP` in server mode only;
|
||||
client calls return an immediate error. A surprising number of "Lean + client
|
||||
doesn't work" forum posts trace back to this [5].
|
||||
- **Pure OPC UA / PROFINET CPs (CP 1542SP-1, CP 1543-1)** support Modbus TCP on
|
||||
S7-1500 via the same `MB_SERVER`/`MB_CLIENT` instructions by passing the
|
||||
CP's `hw_identifier`. There is no separate "Modbus CP" license needed on
|
||||
S7-1500, unlike S7-300/400 [1].
|
||||
- **No S7 Modbus server supports function codes 20/21 (file records),
|
||||
22 (mask write), 23 (read-write multiple), or 43 (device identification).**
|
||||
Sending any of these returns exception `01` (Illegal Function) on every S7
|
||||
variant [1][4]. Our driver must not negotiate FC23 as a "bulk-read optimization"
|
||||
when the profile is S7.
|
||||
|
||||
Test names:
|
||||
`S7_1200_MBSERVER_Loads_OB1_Cyclic`,
|
||||
`S7_CP343_Lean_Client_Mode_Rejected`,
|
||||
`S7_All_FC23_Returns_IllegalFunction`.
|
||||
|
||||
## Address / DB Mapping
|
||||
|
||||
S7 Modbus servers **do not auto-expose PLC memory** — the PLC programmer has to
|
||||
wire one area per Modbus table to a DB or process-image region. This is the
|
||||
single biggest difference vs. DL205/Modicon/etc., where the memory map is
|
||||
fixed at the factory. Our driver must therefore be tolerant of "the same
|
||||
`40001` means completely different things on two S7-1200s on the same site."
|
||||
|
||||
### S7-1200 / S7-1500 `MB_SERVER`
|
||||
|
||||
The `MB_SERVER` instance exposes four Modbus tables to each connected client;
|
||||
each table's backing storage is a per-block parameter [1][8]:
|
||||
|
||||
| Modbus table | FCs | Backing parameter | Default / typical backing |
|
||||
|---------------------|-------------|-----------------------------|-----------------------------|
|
||||
| Coils (0x) | FC01, FC05, FC15 | *implicit* — Q process image | `%Q0.0`–`%Q1023.7` (→ coil addresses 0–8191) [1][9] |
|
||||
| Discrete Inputs (1x)| FC02 | *implicit* — I process image | `%I0.0`–`%I1023.7` (→ discrete addresses 0–8191) [1][9] |
|
||||
| Input Registers (3x)| FC04 | *implicit* — M memory or DB (version-dependent) | Some firmware routes FC04 through the same MB_HOLD_REG buffer [1][8] |
|
||||
| Holding Registers (4x)| FC03, FC06, FC16 | `MB_HOLD_REG` pointer | User DB (e.g. `DB10.DBW0`) or `%MW` area [1][2][8] |
|
||||
|
||||
- **`MB_HOLD_REG` is a pointer (VARIANT / ANY) into a user-defined DB** whose
|
||||
first byte is holding-register 0 (`40001` in 1-based Modicon form). Byte
|
||||
offset 2 is register 1, byte offset 4 is register 2, etc. [1][2].
|
||||
- **The DB *must* have "Optimized block access" UNCHECKED.** Optimized DBs let
|
||||
the compiler reorder fields for alignment; Modbus requires fixed byte
|
||||
offsets. With optimized access on, the compiler accepts the project but
|
||||
`MB_SERVER` returns STATUS `0x8383` (misaligned access) or silently reads
|
||||
zeros [8][10][11]. This is the #1 support-forum complaint.
|
||||
- **FC01/FC02/FC05/FC15 hit the Q and I process images directly — not the
|
||||
`MB_HOLD_REG` DB.** Coil address 0 = `%Q0.0`, coil 1 = `%Q0.1`, coil 8 =
|
||||
`%Q1.0`. The S7-1200 system manual publishes this mapping as `00001 → Q0.0`
|
||||
through `09999 → Q1023.7` and `10001 → I0.0` through `19999 → I1023.7` in
|
||||
1-based form; on the wire (0-based) that's coils 0-8191 and discrete inputs
|
||||
0-8191 [9].
|
||||
- **`%M` markers are NOT automatically exposed.** To expose `%M` over Modbus
|
||||
the programmer must either (a) copy `%M` to the `MB_HOLD_REG` DB each scan,
|
||||
or (b) define an Array\[0..n\] of Bool inside that DB and copy bits in/out
|
||||
of `%M`. Siemens has no "MB_COIL_REG" parameter analogous to
|
||||
`MB_HOLD_REG` — this confuses users migrating from Schneider [9][12].
|
||||
- **Bit ordering within a Modbus holding register sourced from an `Array of
|
||||
Bool`**: S7 stores bool\[0\] at `DBX0.0` which is bit 0 of byte 0 which is
|
||||
the **low byte, low bit** of Modbus register `40001`. A naive client that
|
||||
reads register `40001` and masks `0x0001` gets bool\[0\]. A client that
|
||||
masks `0x8000` gets bool\[15\] because the high byte of the Modbus register
|
||||
is the *second* byte of the DB. Siemens programmers routinely get this
|
||||
wrong in the DB-via-DBX form; `Array[0..n] of Bool` is the recommended
|
||||
layout because it aligns naturally [12][13].
|
||||
|
||||
### S7-300/400 + CP 343-1 / CP 443-1 `MODBUSCP`
|
||||
|
||||
Different paradigm: per-connection **parameter DB** (template
|
||||
`MODBUS_PARAM_CP`) declares a table of up to 8 register-area mappings. Each
|
||||
mapping is a tuple `(data_type, DB#, start_offset, length)` where `data_type`
|
||||
picks the Modbus table [4]:
|
||||
|
||||
- `B#16#1` = Coils
|
||||
- `B#16#2` = Discrete Inputs
|
||||
- `B#16#3` = Holding Registers
|
||||
- `B#16#4` = Input Registers
|
||||
|
||||
The `holding_register_start` and analogous `coils_start` parameters declare
|
||||
**which Modbus address range** the CP will serve, and the DB pointers say
|
||||
where in S7 memory that range lives [4][14]. Unlike `MB_SERVER`, the CP does
|
||||
not reach into `%Q`/`%I` directly — *everything* goes through a DB. If an
|
||||
address outside the declared ranges is requested, the CP returns exception
|
||||
`02` (Illegal Data Address) [4].
|
||||
|
||||
Test names:
|
||||
`S7_1200_FC03_Reg0_Reads_DB10_DBW0`,
|
||||
`S7_1200_Optimized_DB_Returns_0x8383_MisalignedAccess`,
|
||||
`S7_1200_FC01_Coil0_Reads_Q0_0`,
|
||||
`S7_CP343_FC03_Outside_ParamBlock_Range_Returns_IllegalDataAddress`.
|
||||
|
||||
## Data Types and Byte Order
|
||||
|
||||
Siemens CPUs store scalars **big-endian** internally ("Motorola format"), which
|
||||
is the same byte order Modbus specifies inside each register. So for 16-bit
|
||||
values (`Int`, `Word`, `UInt`) the on-the-wire layout is straightforward
|
||||
`AB` — high byte of the PLC value in the high byte of the Modbus register
|
||||
[15][16]. No byte-swap trap for 16-bit types.
|
||||
|
||||
The trap is 32-bit types (`DInt`, `DWord`, `Real`). Here's what actually
|
||||
happens across the S7 family:
|
||||
|
||||
### S7-1200 / S7-1500 `MB_SERVER`
|
||||
|
||||
- **The backing DB stores 32-bit values in big-endian byte order, high word
|
||||
first** — i.e. `ABCD` when viewed as two consecutive Modbus registers. A
|
||||
`Real` at `DB10.DBD0` with value `0x12345678` reads over Modbus as
|
||||
register 0 = `0x1234`, register 1 = `0x5678` [15][16][17].
|
||||
- **This is `ABCD`, *not* `CDAB`.** Clients that hard-code CDAB (common default
|
||||
for meters and VFDs) will get wildly wrong floats. Configure the S7 profile
|
||||
with `WordOrder = ABCD` (aka "big-endian word + big-endian byte" aka
|
||||
"high-word first") [15][17].
|
||||
- **`MB_SERVER` does not swap.** It's a direct memcpy from the DB bytes to
|
||||
the Modbus payload. Whatever byte order the ladder programmer stored into
|
||||
the DB is what the client receives [17]. This means a programmer who used
|
||||
`MOVE_BLK` from two separate `Word`s into `DBD` with the "wrong" order can
|
||||
produce `CDAB` without realising.
|
||||
- **`Real` is IEEE 754 single-precision** — unambiguous, no BCD trap like on
|
||||
DL series [15].
|
||||
- **Strings**: S7 `String[n]` has a 2-byte header (max length, current length)
|
||||
*before* the character bytes. A client reading a string over Modbus gets
|
||||
the header in the first register and then the characters two-per-register
|
||||
in high-byte-first order. `WString` is UTF-16 and the header is 4 bytes
|
||||
[18]. Our driver's string decoder must expose the "skip header" option for
|
||||
S7 profile.
|
||||
|
||||
### S7-300/400 `MODBUSCP` (CP 343-1 / CP 443-1)
|
||||
|
||||
- The CP writes the exact DB bytes onto the wire — again `ABCD` if the DB
|
||||
stores `DInt`/`Real` in native Siemens order [4].
|
||||
- **`MODBUSCP` has no `data_type` byte-swap knob.** (The `data_type` parameter
|
||||
names the Modbus table, not the byte order — see the Address Mapping
|
||||
section.) If the other end of the link expects `CDAB`, the programmer has
|
||||
to swap words in ladder before writing the DB [4][14].
|
||||
|
||||
### Operator-reported oddity
|
||||
|
||||
- Some S7 drivers (Kepware's "Siemens TCP/IP Ethernet" driver, Ignition's
|
||||
"Siemens S7" driver) expose a per-tag `Float Byte Order` with options
|
||||
`ABCD`/`CDAB`/`BADC`/`DCBA` because end-users have encountered every
|
||||
permutation in the field — not because the PLC natively swaps, but because
|
||||
ladder programmers have historically stored floats every which way [19].
|
||||
Our S7 Modbus profile should default to `ABCD` but expose a per-tag
|
||||
override.
|
||||
- **Unconfirmed rumour**: that S7-1500 firmware V2.0+ reverses float byte
|
||||
order for `MB_CLIENT` only. Not reproduced; the Siemens forum thread that
|
||||
launched it was a user error (the remote server was the swapper, not the
|
||||
S7) [20]. Treat as false until proven.
|
||||
|
||||
Test names:
|
||||
`S7_1200_Real_WordOrder_ABCD_Default`,
|
||||
`S7_1200_DInt_HighWord_First_At_DBD0`,
|
||||
`S7_1200_String_Header_First_Two_Bytes`,
|
||||
`S7_CP343_No_Internal_ByteSwap`.
|
||||
|
||||
## Coil / Discrete Input Mapping
|
||||
|
||||
On `MB_SERVER` the mapping from coil address → S7 bit is fixed at the
|
||||
process-image level [1][9][12]:
|
||||
|
||||
| Modbus coil / discrete input addr | S7 address | Notes |
|
||||
|-----------------------------------|---------------|-------------------------------------|
|
||||
| Coil 0 (FC01/05/15) | `%Q0.0` | bit 0 of output byte 0 |
|
||||
| Coil 7 | `%Q0.7` | bit 7 of output byte 0 |
|
||||
| Coil 8 | `%Q1.0` | bit 0 of output byte 1 |
|
||||
| Coil 8191 (max) | `%Q1023.7` | highest exposed output bit |
|
||||
| Discrete input 0 (FC02) | `%I0.0` | bit 0 of input byte 0 |
|
||||
| Discrete input 8191 | `%I1023.7` | highest exposed input bit |
|
||||
|
||||
Formulas:
|
||||
|
||||
```
|
||||
coil_addr = byte_index * 8 + bit_index (e.g. %Q5.3 → coil 43)
|
||||
discr_addr = byte_index * 8 + bit_index (e.g. %I10.2 → disc 82)
|
||||
```
|
||||
|
||||
- **1-based Modicon form adds 1:** coil 0 (wire) = `00001` (Modicon), etc.
|
||||
Our driver sends the 0-based PDU form, so `%Q0.0` writes to wire address 0.
|
||||
- **Writing FC05/FC15 to `%Q` is accepted even while the CPU is in STOP** —
|
||||
the PLC's process image doesn't care about the user program state. But the
|
||||
output won't propagate to the physical module until RUN (see STOP section
|
||||
below) [1][21].
|
||||
- **`%M` markers require a DB-backed `Array of Bool`** as described in the
|
||||
Address Mapping section. Our driver can't assume "coil N = MN.0" like it
|
||||
can on Modicon — on S7 it's always Q/I unless the programmer built a
|
||||
mapping DB [12].
|
||||
- **Bit-inside-holding-register**: for `Array of Bool` inside the
|
||||
`MB_HOLD_REG` DB, bool[0] is bit 0 of byte 0 → **low byte, low bit** of
|
||||
Modbus register 40001. Most third-party clients probe this in the low
|
||||
byte, so the common case works; the less-common case (bool[8]) is bit 0 of
|
||||
byte 1 → **high byte, low bit** of Modbus register 40001. Clients that
|
||||
test only bool[0] will pass and miss the mis-alignment on bool[8] [12][13].
|
||||
|
||||
Test names:
|
||||
`S7_1200_Coil_0_Is_Q0_0`,
|
||||
`S7_1200_Coil_8_Is_Q1_0`,
|
||||
`S7_1200_Discrete_Input_7_Is_I0_7`,
|
||||
`S7_1200_Coil_Write_In_STOP_Accepted_But_Output_Frozen`.
|
||||
|
||||
## Function Code Support & Max Registers Per Request
|
||||
|
||||
| FC | Name | S7-1200 / S7-1500 MB_SERVER | CP 343-1 / CP 443-1 MODBUSCP | Max qty per request |
|
||||
|----|----------------------------|-----------------------------|------------------------------|--------------------------------|
|
||||
| 01 | Read Coils | Yes | Yes | 2000 bits (spec) |
|
||||
| 02 | Read Discrete Inputs | Yes | Yes | 2000 bits (spec) |
|
||||
| 03 | Read Holding Registers | Yes | Yes | **125** (spec max) |
|
||||
| 04 | Read Input Registers | Yes | Yes | **125** |
|
||||
| 05 | Write Single Coil | Yes | Yes | 1 |
|
||||
| 06 | Write Single Register | Yes | Yes | 1 |
|
||||
| 15 | Write Multiple Coils | Yes | Yes | 1968 bits (spec) — *see note* |
|
||||
| 16 | Write Multiple Registers | Yes | Yes | **123** (spec max for TCP) |
|
||||
| 07 | Read Exception Status | No (RTU only) | No | — |
|
||||
| 17 | Report Server ID | No | No | — |
|
||||
| 20/21 | Read/Write File Record | No | No | — |
|
||||
| 22 | Mask Write Register | No | No | — |
|
||||
| 23 | Read/Write Multiple | No | No | — |
|
||||
| 43 | Read Device Identification | No | No | — |
|
||||
|
||||
- **S7-1200/1500 honour the full spec maxima** for FC03/04 (125) and FC16
|
||||
(123) [1][22]. No sub-spec cap like DL260's 100-register FC16 limit.
|
||||
- **FC15 (Write Multiple Coils) on `MB_SERVER`** writes into `%Q`, which maxes
|
||||
out at 1024 bytes = 8192 bits, but the spec's 1968-bit per-request limit
|
||||
caps any single call first [1][9].
|
||||
- **`MB_HOLD_REG` buffer size is bounded by DB size** — max DB size on
|
||||
S7-1200 is 64 KB, on S7-1500 is much larger (several MB depending on CPU),
|
||||
so the practical `MB_HOLD_REG` limit is 32767 16-bit registers on S7-1200
|
||||
and effectively unbounded on S7-1500 [22][23]. The *per-request* limit is
|
||||
still 125.
|
||||
- **Read past the end of `MB_HOLD_REG`** returns exception `02` (Illegal
|
||||
Data Address) at the start of the overflow register, not a partial read
|
||||
[1][8].
|
||||
- **Request larger than spec max** (e.g. FC03 quantity 126) returns exception
|
||||
`03` (Illegal Data Value). Verified on S7-1200 V4.2 [1][24].
|
||||
- **CP 343-1 `MODBUSCP` per-request maxima are spec** (125/125/123/1968/2000),
|
||||
matching the standard [4]. The CP's `MODBUS_PARAM_CP` caps the total
|
||||
*exposed* range, not the per-call quantity.
|
||||
|
||||
Test names:
|
||||
`S7_1200_FC03_126_Registers_Returns_IllegalDataValue`,
|
||||
`S7_1200_FC16_124_Registers_Returns_IllegalDataValue`,
|
||||
`S7_1200_FC03_Past_MB_HOLD_REG_End_Returns_IllegalDataAddress`,
|
||||
`S7_1200_FC17_ReportServerId_Returns_IllegalFunction`.
|
||||
|
||||
## Exception Codes
|
||||
|
||||
S7 Modbus servers return only the four standard exception codes [1][4]:
|
||||
|
||||
| Code | Name | Triggered by |
|
||||
|------|-----------------------|----------------------------------------------------------------------|
|
||||
| 01 | Illegal Function | FC not in the supported list (17, 20-23, 43, any undefined FC) |
|
||||
| 02 | Illegal Data Address | Register outside `MB_HOLD_REG` / outside `MODBUSCP` param-block range |
|
||||
| 03 | Illegal Data Value | Quantity exceeds spec (FC03/04 > 125, FC16 > 123, FC01/02 > 2000, FC15 > 1968) |
|
||||
| 04 | Server Failure | Runtime error inside MB_SERVER (DB access fault, corrupt DB header, MB_SERVER disabled mid-request) [1][24] |
|
||||
|
||||
- **No proprietary exception codes (05/06/0A/0B) are used** on any S7
|
||||
Modbus server [1][4]. Our driver's status-code mapper can treat these as
|
||||
"never observed" on the S7 profile.
|
||||
- **CPU in STOP → `MB_SERVER` keeps running if it's in OB1 of the firmware's
|
||||
communication task, but OB1 itself is not scanned.** In practice:
|
||||
- Holding-register *reads* (FC03) continue to return the last DB values
|
||||
frozen at the moment the CPU entered STOP. The `MB_SERVER` block is in
|
||||
OB1 so it isn't re-invoked; however the TCP stack keeps the socket open
|
||||
and returns cached data on subsequent polls [1][21]. **Unconfirmed**
|
||||
whether this is cached in the CP or in the CPU's communication processor;
|
||||
behaviour varies between firmware 4.0 and 4.5 [21].
|
||||
- Holding-register *writes* (FC06/FC16) during STOP return exception `04`
|
||||
(Server Failure) on S7-1200 V4.2+, and return success-but-discarded on
|
||||
older firmware [1][24]. Our driver should treat FC06/FC16 during STOP as
|
||||
non-deterministic and not rely on the response code.
|
||||
- Coil *writes* (FC05/FC15) to `%Q` are *accepted* by the process image
|
||||
during STOP, but the physical output freezes at its last RUN-mode value
|
||||
(or the configured STOP-mode substitute value) until RUN resumes [1][21].
|
||||
- **Writing a read-only address via FC06/FC16**: returns `02` (Illegal Data
|
||||
Address), not `04`. S7 does not have "write-protected" holding registers —
|
||||
the programmer either exposes a DB for read-write or doesn't expose it at
|
||||
all [1][12].
|
||||
|
||||
STATUS codes (returned in the `STATUS` output of the block, not on the wire):
|
||||
|
||||
- `0x0000` — no error.
|
||||
- `0x7001` — first call, connection being established.
|
||||
- `0x7002` — subsequent cyclic call, connection in progress.
|
||||
- `0x8383` — data access error (optimized DB, DB too small, or type mismatch)
|
||||
[10][24].
|
||||
- `0x8188` — invalid parameter combination (e.g. MB_MODE out of range) [24].
|
||||
- `0x80C8` — mismatched UNIT_ID between MB_CLIENT and `MB_SERVER` [25].
|
||||
|
||||
Test names:
|
||||
`S7_1200_FC03_Outside_HoldReg_Returns_IllegalDataAddress`,
|
||||
`S7_1200_FC16_In_STOP_Returns_ServerFailure`,
|
||||
`S7_1200_FC03_In_STOP_Returns_Cached_Values`,
|
||||
`S7_1200_No_Proprietary_ExceptionCodes_0x05_0x06_0x0A_0x0B`.
|
||||
|
||||
## Connection Behavior
|
||||
|
||||
- **Max simultaneous Modbus TCP connections**:
|
||||
- **S7-1200**: shares a pool of 8 open-communication connections across
|
||||
all TCP/UDP/Modbus use. On a CPU 1211C you get 8 total; on 1215C/1217C
|
||||
still 8 shared among PG/HMI/OUC/Modbus. Each `MB_SERVER` instance
|
||||
reserves one. A typical site with a PG + 1 HMI + 2 Modbus clients uses
|
||||
4 of the 8 [1][26].
|
||||
- **S7-1500**: up to **8 concurrent Modbus TCP server connections** per
|
||||
`MB_SERVER` port, across multiple `MB_SERVER` instance DBs each with a
|
||||
unique port. Total open-communication resources depend on CPU (e.g.
|
||||
CPU 1515-2 PN supports 128 OUC connections total; Modbus is a subset)
|
||||
[1][27].
|
||||
- **CP 343-1 Lean**: up to **8** simultaneous Modbus TCP connections on
|
||||
port 502 [4][5]. Exceeding this refuses at TCP accept.
|
||||
- **CP 443-1 Advanced**: up to **16** simultaneous Modbus TCP connections
|
||||
[4].
|
||||
- **Multi-connection model on `MB_SERVER`**: one instance DB per connection.
|
||||
An instance DB listening on port 502 serves exactly one connection at a
|
||||
time; to serve N simultaneous clients you need N instance DBs each with a
|
||||
unique port (502/503/504...). **This is a real trap** — most users expect
|
||||
port 502 to multiplex [27][28]. Our driver must not assume port 502 is the
|
||||
only listener.
|
||||
- **Keep-alive**: S7-1500's TCP stack does send TCP keepalives (default
|
||||
every ~30 s) but the interval is not exposed as a configurable. S7-1200 is
|
||||
the same. CP 343-1 keepalives are configured via HW Config → CP properties
|
||||
→ Options → "Send keepalive" (default **off** on older firmware, default
|
||||
**on** on firmware V3.0+) [1][29]. Driver-side keepalive is still
|
||||
advisable for S7-300/CP 343-1 on old firmware.
|
||||
- **Idle-timeout close**: `MB_SERVER` does *not* close idle sockets on its
|
||||
own. However, the TCP stack on S7-1500 will close a socket that fails
|
||||
three consecutive keepalive probes (~2 minutes). Forum reports describe
|
||||
`MB_SERVER` connections "dying overnight" on S7-1500 when an HMI stops
|
||||
polling — the fix is to enable driver-side periodic reads or driver-side
|
||||
TCP keepalive [29][30].
|
||||
- **Reconnect after power cycle**: MB_SERVER starts listening ~1-2 seconds
|
||||
after the CPU reaches RUN. If the client reconnects during STARTUP OB
|
||||
(OB100), the connection is refused until OB1 runs the block at least once.
|
||||
Our driver should back off and retry on `ECONNREFUSED` for the first 5
|
||||
seconds after a power-cycle detection [1][24].
|
||||
- **Unit Identifier**: `MB_SERVER` accepts **any** Unit ID by default — there
|
||||
is no configurable filter; the PLC ignores the Unit ID field entirely.
|
||||
`MB_CLIENT` defaults to Unit ID = 255 as "ignore" [25][31]. Some
|
||||
third-party Modbus-TCP gateways *require* a specific Unit ID; sending
|
||||
anything to S7 is safe. **CP 343-1 `MODBUSCP`** also accepts any Unit ID
|
||||
in server mode, but the parameter DB exposes a `single_write` / `unit_id`
|
||||
field on newer firmware to allow filtering [4].
|
||||
|
||||
Test names:
|
||||
`S7_1200_9th_TCP_Connection_Refused_On_8_Conn_Pool`,
|
||||
`S7_1500_Port_503_Required_For_Second_Instance`,
|
||||
`S7_1200_Reconnect_After_Power_Cycle_Succeeds_Within_5s`,
|
||||
`S7_1200_Unit_ID_Ignored_Any_Accepted`.
|
||||
|
||||
## Behavioral Oddities
|
||||
|
||||
- **Transaction ID echo** is reliable on all S7 variants. `MB_SERVER` copies
|
||||
the MBAP TxId verbatim. No known firmware that drops TxId under load [1][31].
|
||||
- **Request serialization**: a single `MB_SERVER` instance serializes
|
||||
requests from its one connected client — the block processes one PDU per
|
||||
call and calls happen once per OB1 scan. OB1 scan time of 5-50 ms puts an
|
||||
upper bound on throughput at ~20-200 requests/sec per connection [1][30].
|
||||
Multiple `MB_SERVER` instances (one per port) run in parallel because OB1
|
||||
calls them sequentially within the same scan.
|
||||
- **OB1 scan coupling**: `MB_SERVER` must be called cyclically from OB1 (or
|
||||
another cyclic OB). If the programmer puts it in a conditional branch
|
||||
that doesn't fire every scan, requests time out. The STATUS `0x7002`
|
||||
"in progress" is *expected* between calls, not an error [1][24].
|
||||
- **Optimized DB backing `MB_HOLD_REG`** — already covered in Address
|
||||
Mapping; STATUS becomes `0x8383`. This is the most common deployment bug
|
||||
on S7-1500 projects migrated from older S7-1200 examples [10][11].
|
||||
- **CPU STOP behaviour** — covered in Exception Codes section. The short
|
||||
version: reads may return stale data without error; writes return exception
|
||||
04 on modern firmware.
|
||||
- **Partial-frame disconnect**: S7-1200/1500 TCP stack closes the socket on
|
||||
any MBAP header where the `Length` field doesn't match the PDU length.
|
||||
Driver must detect half-close and reconnect [1][29].
|
||||
- **MBAP `Protocol ID` must be 0**. Any non-zero value causes the CP/CPU to
|
||||
drop the frame silently (no response, no RST) on S7-1500 firmware V2.0
|
||||
through V2.9; firmware V3.0+ sends an RST [1][30]. *Unconfirmed* whether
|
||||
V3.1 still sends RST or returns to silent drop.
|
||||
- **FC01/FC02 access outside `%Q`/`%I` range**: on S7-1200, requesting
|
||||
coil address 8192 (= `%Q1024.0`) returns exception `02` (Illegal Data
|
||||
Address) [1][9]. The 8192-bit hard cap is a process-image size limit on
|
||||
the CPU, not a Modbus protocol limit.
|
||||
- **`MB_CLIENT` UNIT_ID mismatch with remote `MB_SERVER`** produces STATUS
|
||||
`0x80C8` on the client side, and the server silently discards the frame
|
||||
(no response on the wire) [25]. This matters for Modbus-TCP-to-RTU
|
||||
gateway scenarios where the Unit ID picks the RTU slave.
|
||||
- **Non-IEEE REAL / BCD**: S7 does *not* use BCD like DirectLOGIC. `Real` is
|
||||
always IEEE 754 single-precision. `LReal` (8-byte double) occupies 4
|
||||
Modbus registers in `ABCDEFGH` order (big-endian byte, big-endian word)
|
||||
[15][18].
|
||||
- **`MODBUSCP` single-write** on CP 343-1: a parameter `single_write` in the
|
||||
param DB controls whether FC06 on a register in the "holding register"
|
||||
area triggers a callback to the user program vs. updates the DB directly.
|
||||
Default is direct update. If a ladder programmer enables the callback
|
||||
without implementing the callback OB, FC06 writes hang for 5 seconds then
|
||||
return exception `04` [4].
|
||||
|
||||
Test names:
|
||||
`S7_1200_TxId_Preserved_Across_Burst_Of_50_Requests`,
|
||||
`S7_1200_MBSERVER_Throughput_Capped_By_OB1_Scan`,
|
||||
`S7_1200_MBAP_ProtocolID_NonZero_Frame_Dropped`,
|
||||
`S7_1200_Partial_MBAP_Causes_Half_Close`.
|
||||
|
||||
## Model-specific Differences Worth Separate Test Coverage
|
||||
|
||||
- **S7-1200 V4.0 vs V4.4+**: Older firmware does not support `WString` over
|
||||
`MB_HOLD_REG` and returns `0x8383` if the DB contains one [18][24]. Test
|
||||
both firmware bands separately.
|
||||
- **S7-1500 vs S7-1200**: S7-1500 supports multiple `MB_SERVER` instances on
|
||||
the *same* CPU with different ports cleanly; S7-1200 can too but its
|
||||
8-connection pool is shared tighter [1][27]. Throughput per-connection is
|
||||
~5× faster on S7-1500 because the comms task runs on a dedicated core.
|
||||
- **S7-300 + CP 343-1 vs S7-1200/1500**: parameter-block mapping (not
|
||||
`MB_HOLD_REG` pointer), per-connection license, no `%Q`/`%I` direct
|
||||
access for coils (everything goes through a DB), different STATUS codes
|
||||
(`DONE`/`ERROR`/`STATUS` word pairs vs. the single STATUS word) [4][14].
|
||||
Driver-side it's a different profile.
|
||||
- **CP 343-1 Lean vs CP 343-1 Advanced**: Lean is server-only; Advanced is
|
||||
client + server. Lean's max connections = 8; Advanced = 16 [4][5].
|
||||
- **CP 443-1 in S7-400H**: uses `MODBUSCP_REDUNDANT` which presents two
|
||||
Ethernet endpoints that fail over. Our driver's redundancy support should
|
||||
recognize the S7-400H profile as "two IP addresses, same server state,
|
||||
advertise via `ServerUriArray`" [6].
|
||||
- **ET 200SP CPU (1510SP / 1512SP)**: behaves as S7-1500 from `MB_SERVER`
|
||||
perspective. No known deltas [3].
|
||||
|
||||
## References
|
||||
|
||||
1. Siemens Industry Online Support, *Modbus/TCP Communication between SIMATIC S7-1500 / S7-1200 and Modbus/TCP Controllers with Instructions `MB_CLIENT` and `MB_SERVER`*, Entry ID 102020340, V6 (Feb 2021). https://cache.industry.siemens.com/dl/files/340/102020340/att_118119/v6/net_modbus_tcp_s7-1500_s7-1200_en.pdf
|
||||
2. Siemens TIA Portal Online Docs, *MB_SERVER instruction*. https://docs.tia.siemens.cloud/r/simatic_s7_1200_manual_collection_eses_20/communication-processor-and-modbus-tcp/modbus-communication/modbus-tcp/modbus-tcp-instructions/mb_server-communicate-using-profinet-as-modbus-tcp-server-instruction
|
||||
3. Siemens, *SIMATIC S7-1500 Communication Function Manual* (covers ET 200SP CPU). http://public.eandm.com/Public_Docs/s71500_communication_function_manual_en-US_en-US.pdf
|
||||
4. Siemens Industry Online Support, *SIMATIC Modbus/TCP communication using CP 343-1 and CP 443-1 — Programming Manual*, Entry ID 103447617. https://cache.industry.siemens.com/dl/files/617/103447617/att_106971/v1/simatic_modbus_tcp_cp_en-US_en-US.pdf
|
||||
5. Siemens Industry Online Support FAQ *"Which technical data applies for the SIMATIC Modbus/TCP software for CP 343-1 / CP 443-1?"*, Entry ID 104946406. https://www.industry-mobile-support.siemens-info.com/en/article/detail/104946406
|
||||
6. Siemens Industry Online Support, *Redundant Modbus/TCP communication via CP 443-1 in S7-400H systems*, Entry ID 109739212. https://cache.industry.siemens.com/dl/files/212/109739212/att_887886/v1/SIMATIC_modbus_tcp_cp_red_e_en-US.pdf
|
||||
7. Siemens Industry Online Support, *SIMATIC MODBUS (TCP) PN CPU Library — Programming and Operating Manual 06/2014*, Entry ID 75330636. https://support.industry.siemens.com/cs/attachments/75330636/ModbusTCPPNCPUen.pdf
|
||||
8. DMC Inc., *Using an S7-1200 PLC as a Modbus TCP Slave*. https://www.dmcinfo.com/blog/27313/using-an-s7-1200-plc-as-a-modbus-tcp-slave/
|
||||
9. Siemens, *SIMATIC S7-1200 System Manual* (V4.x), "MB_SERVER" pages 736-742. https://www.manualslib.com/manual/1453610/Siemens-S7-1200.html?page=736
|
||||
10. lamaPLC, *Simatic Modbus S7 error- and statuscodes*. https://www.lamaplc.com/doku.php?id=simatic:errorcodes
|
||||
11. ScadaProtocols, *How to Configure Modbus TCP on Siemens S7-1200 (TIA Portal Step-by-Step)*. https://scadaprotocols.com/modbus-tcp-siemens-s7-1200-tia-portal/
|
||||
12. Industrial Monitor Direct, *Reading and Writing Memory Bits via Modbus TCP on S7-1200*. https://industrialmonitordirect.com/blogs/knowledgebase/reading-and-writing-memory-bits-via-modbus-tcp-on-s7-1200
|
||||
13. PLCtalk forum *"Siemens S7-1200 modbus understanding"*. https://www.plctalk.net/forums/threads/siemens-s7-1200-modbus-understanding.104119/
|
||||
14. Siemens SIMATIC S7 Manual, "Function block MODBUSCP — Functionality" (ManualsLib p29). https://www.manualslib.com/manual/1580661/Siemens-Simatic-S7.html?page=29
|
||||
15. Chipkin, *How Real (Floating Point) and 32-bit Data is Encoded in Modbus*. https://store.chipkin.com/articles/how-real-floating-point-and-32-bit-data-is-encoded-in-modbus-rtu-messages
|
||||
16. Siemens Industry Online Support forum, *MODBUS DATA conversion in S7-1200 CPU*, Entry ID 97287. https://support.industry.siemens.com/forum/WW/en/posts/modbus-data-converson-in-s7-1200-cpu/97287
|
||||
17. Industrial Monitor Direct, *Siemens S7-1500 MB_SERVER Modbus TCP Configuration Guide*. https://industrialmonitordirect.com/de/blogs/knowledgebase/siemens-s7-1500-mb-server-modbus-tcp-configuration-guide
|
||||
18. Siemens TIA Portal, *Data types in SIMATIC S7-1200/1500 — String/WString header layout* (system manual, "Elementary Data Types").
|
||||
19. Kepware / PTC, *Siemens TCP/IP Ethernet Driver Help*, "Byte / Word Order" tag property. https://www.opcturkey.com/uploads/siemens-tcp-ip-ethernet-manual.pdf
|
||||
20. Siemens SiePortal forum, *Transfer float out of words*, Entry ID 187811. https://sieportal.siemens.com/en-ww/support/forum/posts/transfer-float-out-of-words/187811 _(operator-reported "S7 swaps float" claim — traced to remote-device issue; **unconfirmed**.)_
|
||||
21. Siemens SiePortal forum, *S7-1200 communication with Modbus TCP*, Entry ID 133086. https://support.industry.siemens.com/forum/WW/en/posts/s7-1200-communication-with-modbus-tcp/133086
|
||||
22. Siemens SiePortal forum, *S7-1500 MB Server Holding Register Max Word*, Entry ID 224636. https://support.industry.siemens.com/forum/WW/en/posts/s7-1500-mb-server-holding-register-max-word/224636
|
||||
23. Siemens, *SIMATIC S7-1500 Technical Specifications* — CPU-specific DB size limits in each CPU manual's "Memory" table.
|
||||
24. Siemens TIA Portal Online Docs, *Error messages (S7-1200, S7-1500) — Modbus instructions*. https://docs.tia.siemens.cloud/r/en-us/v20/modbus-rtu-s7-1200-s7-1500/error-messages-s7-1200-s7-1500
|
||||
25. Industrial Monitor Direct, *Fix Siemens S7-1500 MB_Client UnitID Error 80C8*. https://industrialmonitordirect.com/blogs/knowledgebase/troubleshooting-mb-client-on-s7-1500-cpu-1515sp-modbus-tcp
|
||||
26. Siemens SiePortal forum, *How many TCP connections can the S7-1200 make?*, Entry ID 275570. https://support.industry.siemens.com/forum/WW/en/posts/how-many-tcp-connections-can-the-s7-1200-make/275570
|
||||
27. Siemens SiePortal forum, *Simultaneous connections of Modbus TCP*, Entry ID 189626. https://support.industry.siemens.com/forum/ww/en/posts/simultaneous-connections-of-modbus-tcp/189626
|
||||
28. Siemens SiePortal forum, *How many Modbus TCP IP clients can read simultaneously from S7-1517*, Entry ID 261569. https://support.industry.siemens.com/forum/WW/en/posts/how-many-modbus-tcp-ip-client-can-read-simultaneously-in-s7-1517/261569
|
||||
29. Industrial Monitor Direct, *Troubleshooting Intermittent Modbus TCP Connections on S7-1500 PLC*. https://industrialmonitordirect.com/blogs/knowledgebase/troubleshooting-intermittent-modbus-tcp-connections-on-s7-1500-plc
|
||||
30. PLCtalk forum *"S7-1500 modbus tcp speed?"*. https://www.plctalk.net/forums/threads/s7-1500-modbus-tcp-speed.114046/
|
||||
31. Siemens SiePortal forum, *MB_Unit_ID parameter in Modbus TCP*, Entry ID 156635. https://support.industry.siemens.com/forum/WW/en/posts/mb-unit-id-parameter-in-modbus-tcp/156635
|
||||
79
scripts/compliance/phase-6-1-compliance.ps1
Normal file
79
scripts/compliance/phase-6-1-compliance.ps1
Normal file
@@ -0,0 +1,79 @@
|
||||
<#
|
||||
.SYNOPSIS
|
||||
Phase 6.1 exit-gate compliance check — stub. Each `Assert-*` either passes
|
||||
(Write-Host green) or throws. Non-zero exit = fail.
|
||||
|
||||
.DESCRIPTION
|
||||
Validates Phase 6.1 (Resilience & Observability runtime) completion. Checks
|
||||
enumerated in `docs/v2/implementation/phase-6-1-resilience-and-observability.md`
|
||||
§"Compliance Checks (run at exit gate)".
|
||||
|
||||
Current status: SCAFFOLD. Every check writes a TODO line and does NOT throw.
|
||||
Each implementation task in Phase 6.1 is responsible for replacing its TODO
|
||||
with a real check before closing that task.
|
||||
|
||||
.NOTES
|
||||
Usage: pwsh ./scripts/compliance/phase-6-1-compliance.ps1
|
||||
Exit: 0 = all checks passed (or are still TODO); non-zero = explicit fail
|
||||
#>
|
||||
[CmdletBinding()]
|
||||
param()
|
||||
|
||||
$ErrorActionPreference = 'Stop'
|
||||
$script:failures = 0
|
||||
|
||||
function Assert-Todo {
|
||||
param([string]$Check, [string]$ImplementationTask)
|
||||
Write-Host " [TODO] $Check (implement during $ImplementationTask)" -ForegroundColor Yellow
|
||||
}
|
||||
|
||||
function Assert-Pass {
|
||||
param([string]$Check)
|
||||
Write-Host " [PASS] $Check" -ForegroundColor Green
|
||||
}
|
||||
|
||||
function Assert-Fail {
|
||||
param([string]$Check, [string]$Reason)
|
||||
Write-Host " [FAIL] $Check — $Reason" -ForegroundColor Red
|
||||
$script:failures++
|
||||
}
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "=== Phase 6.1 compliance — Resilience & Observability runtime ===" -ForegroundColor Cyan
|
||||
Write-Host ""
|
||||
|
||||
Write-Host "Stream A — Resilience layer"
|
||||
Assert-Todo "Invoker coverage — every capability-interface method routes through CapabilityInvoker (analyzer error-level)" "Stream A.3"
|
||||
Assert-Todo "Write-retry guard — writes without [WriteIdempotent] never retry" "Stream A.5"
|
||||
Assert-Todo "Pipeline isolation — `(DriverInstanceId, HostName)` key; one dead host does not open breaker for siblings" "Stream A.5"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream B — Tier A/B/C runtime"
|
||||
Assert-Todo "Tier registry — every driver type has non-null Tier; Tier C declares out-of-process topology" "Stream B.1"
|
||||
Assert-Todo "MemoryTracking never kills — soft/hard breach on Tier A/B logs + surfaces without terminating" "Stream B.6"
|
||||
Assert-Todo "MemoryRecycle Tier C only — hard breach on Tier A never invokes supervisor; Tier C does" "Stream B.6"
|
||||
Assert-Todo "Wedge demand-aware — idle/historic-backfill/write-only cases stay Healthy" "Stream B.6"
|
||||
Assert-Todo "Galaxy supervisor preserved — Driver.Galaxy.Proxy/Supervisor/CircuitBreaker + Backoff still present + invoked" "Stream A.4"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream C — Health + logging"
|
||||
Assert-Todo "Health state machine — /healthz + /readyz respond < 500 ms for every DriverState per matrix in plan" "Stream C.4"
|
||||
Assert-Todo "Structured log — CI grep asserts DriverInstanceId + CorrelationId JSON fields present" "Stream C.4"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream D — LiteDB cache"
|
||||
Assert-Todo "Generation-sealed snapshot — SQL kill mid-op serves last-sealed snapshot; UsingStaleConfig=true" "Stream D.4"
|
||||
Assert-Todo "Mixed-generation guard — corruption of snapshot file fails closed; no mixed reads" "Stream D.4"
|
||||
Assert-Todo "First-boot no-snapshot + DB-down — InitializeAsync fails with clear error" "Stream D.4"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Cross-cutting"
|
||||
Assert-Todo "No test-count regression — dotnet test ZB.MOM.WW.OtOpcUa.slnx count ≥ pre-Phase-6.1 baseline" "Final exit-gate"
|
||||
|
||||
Write-Host ""
|
||||
if ($script:failures -eq 0) {
|
||||
Write-Host "Phase 6.1 compliance: scaffold-mode PASS (all checks TODO)" -ForegroundColor Green
|
||||
exit 0
|
||||
}
|
||||
Write-Host "Phase 6.1 compliance: $script:failures FAIL(s)" -ForegroundColor Red
|
||||
exit 1
|
||||
81
scripts/compliance/phase-6-2-compliance.ps1
Normal file
81
scripts/compliance/phase-6-2-compliance.ps1
Normal file
@@ -0,0 +1,81 @@
|
||||
<#
|
||||
.SYNOPSIS
|
||||
Phase 6.2 exit-gate compliance check — stub. Each `Assert-*` either passes
|
||||
(Write-Host green) or throws. Non-zero exit = fail.
|
||||
|
||||
.DESCRIPTION
|
||||
Validates Phase 6.2 (Authorization runtime) completion. Checks enumerated
|
||||
in `docs/v2/implementation/phase-6-2-authorization-runtime.md`
|
||||
§"Compliance Checks (run at exit gate)".
|
||||
|
||||
Current status: SCAFFOLD. Every check writes a TODO line and does NOT throw.
|
||||
Each implementation task in Phase 6.2 is responsible for replacing its TODO
|
||||
with a real check before closing that task.
|
||||
|
||||
.NOTES
|
||||
Usage: pwsh ./scripts/compliance/phase-6-2-compliance.ps1
|
||||
Exit: 0 = all checks passed (or are still TODO); non-zero = explicit fail
|
||||
#>
|
||||
[CmdletBinding()]
|
||||
param()
|
||||
|
||||
$ErrorActionPreference = 'Stop'
|
||||
$script:failures = 0
|
||||
|
||||
function Assert-Todo {
|
||||
param([string]$Check, [string]$ImplementationTask)
|
||||
Write-Host " [TODO] $Check (implement during $ImplementationTask)" -ForegroundColor Yellow
|
||||
}
|
||||
|
||||
function Assert-Pass {
|
||||
param([string]$Check)
|
||||
Write-Host " [PASS] $Check" -ForegroundColor Green
|
||||
}
|
||||
|
||||
function Assert-Fail {
|
||||
param([string]$Check, [string]$Reason)
|
||||
Write-Host " [FAIL] $Check — $Reason" -ForegroundColor Red
|
||||
$script:failures++
|
||||
}
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "=== Phase 6.2 compliance — Authorization runtime ===" -ForegroundColor Cyan
|
||||
Write-Host ""
|
||||
|
||||
Write-Host "Stream A — LdapGroupRoleMapping (control plane)"
|
||||
Assert-Todo "Control/data-plane separation — Core.Authorization has zero refs to LdapGroupRoleMapping" "Stream A.2"
|
||||
Assert-Todo "Authoring validation — AclsTab rejects duplicate (LdapGroup, Scope) pre-save" "Stream A.3"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream B — Evaluator + trie + cache"
|
||||
Assert-Todo "Trie invariants — PermissionTrieBuilder idempotent (build twice == equal)" "Stream B.1"
|
||||
Assert-Todo "Additive grants + cluster isolation — cross-cluster leakage impossible" "Stream B.1"
|
||||
Assert-Todo "Galaxy FolderSegment coverage — folder-subtree grant cascades; siblings unaffected" "Stream B.2"
|
||||
Assert-Todo "Redundancy-safe invalidation — generation-mismatch forces trie re-load on peer" "Stream B.4"
|
||||
Assert-Todo "Membership freshness — 15 min interval elapsed + LDAP down = fail-closed" "Stream B.5"
|
||||
Assert-Todo "Auth cache fail-closed — 5 min AuthCacheMaxStaleness exceeded = NotGranted" "Stream B.5"
|
||||
Assert-Todo "AuthorizationDecision shape — Allow + NotGranted only; Denied variant exists unused" "Stream B.6"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream C — OPC UA operation wiring"
|
||||
Assert-Todo "Every operation wired — Browse/Read/Write/HistoryRead/HistoryUpdate/CreateMonitoredItems/TransferSubscriptions/Call/Ack/Confirm/Shelve" "Stream C.1-C.7"
|
||||
Assert-Todo "HistoryRead uses its own flag — Read+no-HistoryRead denies HistoryRead" "Stream C.3"
|
||||
Assert-Todo "Mixed-batch semantics — 3 allowed + 2 denied returns per-item status, no coarse failure" "Stream C.6"
|
||||
Assert-Todo "Browse ancestor visibility — deep grant implies ancestor browse; denied ancestors filter" "Stream C.7"
|
||||
Assert-Todo "Subscription re-authorization — revoked grant surfaces BadUserAccessDenied in one publish" "Stream C.5"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream D — Admin UI + SignalR invalidation"
|
||||
Assert-Todo "SignalR invalidation — sp_PublishGeneration pushes PermissionTrieCache invalidate < 2 s" "Stream D.4"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Cross-cutting"
|
||||
Assert-Todo "No test-count regression — dotnet test ZB.MOM.WW.OtOpcUa.slnx count ≥ pre-Phase-6.2 baseline" "Final exit-gate"
|
||||
|
||||
Write-Host ""
|
||||
if ($script:failures -eq 0) {
|
||||
Write-Host "Phase 6.2 compliance: scaffold-mode PASS (all checks TODO)" -ForegroundColor Green
|
||||
exit 0
|
||||
}
|
||||
Write-Host "Phase 6.2 compliance: $script:failures FAIL(s)" -ForegroundColor Red
|
||||
exit 1
|
||||
85
scripts/compliance/phase-6-3-compliance.ps1
Normal file
85
scripts/compliance/phase-6-3-compliance.ps1
Normal file
@@ -0,0 +1,85 @@
|
||||
<#
|
||||
.SYNOPSIS
|
||||
Phase 6.3 exit-gate compliance check — stub. Each `Assert-*` either passes
|
||||
(Write-Host green) or throws. Non-zero exit = fail.
|
||||
|
||||
.DESCRIPTION
|
||||
Validates Phase 6.3 (Redundancy runtime) completion. Checks enumerated in
|
||||
`docs/v2/implementation/phase-6-3-redundancy-runtime.md`
|
||||
§"Compliance Checks (run at exit gate)".
|
||||
|
||||
Current status: SCAFFOLD. Every check writes a TODO line and does NOT throw.
|
||||
Each implementation task in Phase 6.3 is responsible for replacing its TODO
|
||||
with a real check before closing that task.
|
||||
|
||||
.NOTES
|
||||
Usage: pwsh ./scripts/compliance/phase-6-3-compliance.ps1
|
||||
Exit: 0 = all checks passed (or are still TODO); non-zero = explicit fail
|
||||
#>
|
||||
[CmdletBinding()]
|
||||
param()
|
||||
|
||||
$ErrorActionPreference = 'Stop'
|
||||
$script:failures = 0
|
||||
|
||||
function Assert-Todo {
|
||||
param([string]$Check, [string]$ImplementationTask)
|
||||
Write-Host " [TODO] $Check (implement during $ImplementationTask)" -ForegroundColor Yellow
|
||||
}
|
||||
|
||||
function Assert-Pass {
|
||||
param([string]$Check)
|
||||
Write-Host " [PASS] $Check" -ForegroundColor Green
|
||||
}
|
||||
|
||||
function Assert-Fail {
|
||||
param([string]$Check, [string]$Reason)
|
||||
Write-Host " [FAIL] $Check — $Reason" -ForegroundColor Red
|
||||
$script:failures++
|
||||
}
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "=== Phase 6.3 compliance — Redundancy runtime ===" -ForegroundColor Cyan
|
||||
Write-Host ""
|
||||
|
||||
Write-Host "Stream A — Topology loader"
|
||||
Assert-Todo "Transparent-mode rejection — sp_PublishGeneration blocks RedundancyMode=Transparent" "Stream A.3"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream B — Peer probe + ServiceLevel calculator"
|
||||
Assert-Todo "OPC UA band compliance — 0=Maintenance / 1=NoData reserved; operational 2..255" "Stream B.2"
|
||||
Assert-Todo "Authoritative-Primary ServiceLevel = 255" "Stream B.2"
|
||||
Assert-Todo "Isolated-Primary (peer unreachable, self serving) = 230" "Stream B.2"
|
||||
Assert-Todo "Primary-Mid-Apply = 200" "Stream B.2"
|
||||
Assert-Todo "Recovering-Primary = 180 with dwell + publish witness enforced" "Stream B.2"
|
||||
Assert-Todo "Authoritative-Backup = 100" "Stream B.2"
|
||||
Assert-Todo "Isolated-Backup (primary unreachable) = 80 — no auto-promote" "Stream B.2"
|
||||
Assert-Todo "InvalidTopology = 2 — >1 Primary self-demotes both nodes" "Stream B.2"
|
||||
Assert-Todo "UaHealthProbe authority — HTTP-200 + UA-down peer treated as UA-unhealthy" "Stream B.1"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream C — OPC UA node wiring"
|
||||
Assert-Todo "ServerUriArray — returns self + peer URIs, self first" "Stream C.2"
|
||||
Assert-Todo "Client.CLI cutover — primary halt triggers reconnect to backup via ServerUriArray" "Stream C.4"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream D — Apply-lease + publish fencing"
|
||||
Assert-Todo "Apply-lease disposal — leases close on exception, cancellation, watchdog timeout" "Stream D.2"
|
||||
Assert-Todo "Role transition via operator publish — no restart; both nodes flip ServiceLevel on publish confirm" "Stream D.3"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream F — Interop matrix"
|
||||
Assert-Todo "Client interoperability matrix — Ignition 8.1/8.3 / Kepware / Aveva OI Gateway findings documented" "Stream F.1-F.2"
|
||||
Assert-Todo "Galaxy MXAccess failover — primary kill; Galaxy consumer reconnects within session-timeout budget" "Stream F.3"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Cross-cutting"
|
||||
Assert-Todo "No regression in driver test suites; /healthz reachable under redundancy load" "Final exit-gate"
|
||||
|
||||
Write-Host ""
|
||||
if ($script:failures -eq 0) {
|
||||
Write-Host "Phase 6.3 compliance: scaffold-mode PASS (all checks TODO)" -ForegroundColor Green
|
||||
exit 0
|
||||
}
|
||||
Write-Host "Phase 6.3 compliance: $script:failures FAIL(s)" -ForegroundColor Red
|
||||
exit 1
|
||||
83
scripts/compliance/phase-6-4-compliance.ps1
Normal file
83
scripts/compliance/phase-6-4-compliance.ps1
Normal file
@@ -0,0 +1,83 @@
|
||||
<#
|
||||
.SYNOPSIS
|
||||
Phase 6.4 exit-gate compliance check — stub. Each `Assert-*` either passes
|
||||
(Write-Host green) or throws. Non-zero exit = fail.
|
||||
|
||||
.DESCRIPTION
|
||||
Validates Phase 6.4 (Admin UI completion) completion. Checks enumerated in
|
||||
`docs/v2/implementation/phase-6-4-admin-ui-completion.md`
|
||||
§"Compliance Checks (run at exit gate)".
|
||||
|
||||
Current status: SCAFFOLD. Every check writes a TODO line and does NOT throw.
|
||||
Each implementation task in Phase 6.4 is responsible for replacing its TODO
|
||||
with a real check before closing that task.
|
||||
|
||||
.NOTES
|
||||
Usage: pwsh ./scripts/compliance/phase-6-4-compliance.ps1
|
||||
Exit: 0 = all checks passed (or are still TODO); non-zero = explicit fail
|
||||
#>
|
||||
[CmdletBinding()]
|
||||
param()
|
||||
|
||||
$ErrorActionPreference = 'Stop'
|
||||
$script:failures = 0
|
||||
|
||||
function Assert-Todo {
|
||||
param([string]$Check, [string]$ImplementationTask)
|
||||
Write-Host " [TODO] $Check (implement during $ImplementationTask)" -ForegroundColor Yellow
|
||||
}
|
||||
|
||||
function Assert-Pass {
|
||||
param([string]$Check)
|
||||
Write-Host " [PASS] $Check" -ForegroundColor Green
|
||||
}
|
||||
|
||||
function Assert-Fail {
|
||||
param([string]$Check, [string]$Reason)
|
||||
Write-Host " [FAIL] $Check — $Reason" -ForegroundColor Red
|
||||
$script:failures++
|
||||
}
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "=== Phase 6.4 compliance — Admin UI completion ===" -ForegroundColor Cyan
|
||||
Write-Host ""
|
||||
|
||||
Write-Host "Stream A — UNS drag/move + impact preview"
|
||||
Assert-Todo "UNS drag/move — drag line across areas; modal shows correct impacted-equipment + tag counts" "Stream A.2"
|
||||
Assert-Todo "Concurrent-edit safety — session B saves draft mid-preview; session A Confirm returns 409" "Stream A.3 (DraftRevisionToken)"
|
||||
Assert-Todo "Cross-cluster drop disabled — actionable toast points to Export/Import" "Stream A.2"
|
||||
Assert-Todo "1000-node tree — drag-enter feedback < 100 ms" "Stream A.4"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream B — CSV import + staged-import + 5-identifier search"
|
||||
Assert-Todo "CSV header version — file missing '# OtOpcUaCsv v1' rejected pre-parse" "Stream B.1"
|
||||
Assert-Todo "CSV canonical identifier set — columns match decision #117 exactly" "Stream B.1"
|
||||
Assert-Todo "Staged-import atomicity — 10k-row FinaliseImportBatch < 30 s; user-scoped visibility; DropImportBatch rollback" "Stream B.3"
|
||||
Assert-Todo "Concurrent import + external reservation — finalize retries with conflict handling; no corruption" "Stream B.3"
|
||||
Assert-Todo "5-identifier search ranking — exact > prefix; published > draft for equal scores" "Stream B.4"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream C — DiffViewer sections"
|
||||
Assert-Todo "Diff viewer section caps — 2000-row subtree-rename summary-only; 'Load full diff' paginates" "Stream C.2"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream D — Identification (OPC 40010)"
|
||||
Assert-Todo "OPC 40010 field list match — rendered fields match decision #139 exactly; no extras" "Stream D.1"
|
||||
Assert-Todo "OPC 40010 exposure — Identification sub-folder shows when non-null; absent when all null" "Stream D.3"
|
||||
Assert-Todo "ACL inheritance for Identification — Equipment-grant reads; no-grant denies both" "Stream D.4"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Visual compliance"
|
||||
Assert-Todo "Visual parity reviewer — FleetAdmin signoff vs admin-ui.md §Visual-Design; screenshot set checked in under docs/v2/visual-compliance/phase-6-4/" "Visual review"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Cross-cutting"
|
||||
Assert-Todo "Full solution dotnet test passes; no test-count regression vs pre-Phase-6.4 baseline" "Final exit-gate"
|
||||
|
||||
Write-Host ""
|
||||
if ($script:failures -eq 0) {
|
||||
Write-Host "Phase 6.4 compliance: scaffold-mode PASS (all checks TODO)" -ForegroundColor Green
|
||||
exit 0
|
||||
}
|
||||
Write-Host "Phase 6.4 compliance: $script:failures FAIL(s)" -ForegroundColor Red
|
||||
exit 1
|
||||
@@ -5,15 +5,18 @@
|
||||
<h5 class="mb-4">OtOpcUa Admin</h5>
|
||||
<ul class="nav flex-column">
|
||||
<li class="nav-item"><a class="nav-link text-light" href="/">Overview</a></li>
|
||||
<li class="nav-item"><a class="nav-link text-light" href="/fleet">Fleet status</a></li>
|
||||
<li class="nav-item"><a class="nav-link text-light" href="/hosts">Host status</a></li>
|
||||
<li class="nav-item"><a class="nav-link text-light" href="/clusters">Clusters</a></li>
|
||||
<li class="nav-item"><a class="nav-link text-light" href="/reservations">Reservations</a></li>
|
||||
<li class="nav-item"><a class="nav-link text-light" href="/certificates">Certificates</a></li>
|
||||
</ul>
|
||||
|
||||
<div class="mt-5">
|
||||
<AuthorizeView>
|
||||
<Authorized>
|
||||
<div class="small text-light">
|
||||
Signed in as <strong>@context.User.Identity?.Name</strong>
|
||||
Signed in as <a class="text-light" href="/account"><strong>@context.User.Identity?.Name</strong></a>
|
||||
</div>
|
||||
<div class="small text-muted">
|
||||
@string.Join(", ", context.User.Claims.Where(c => c.Type.EndsWith("/role")).Select(c => c.Value))
|
||||
|
||||
129
src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Account.razor
Normal file
129
src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Account.razor
Normal file
@@ -0,0 +1,129 @@
|
||||
@page "/account"
|
||||
@attribute [Microsoft.AspNetCore.Authorization.Authorize]
|
||||
@using System.Security.Claims
|
||||
@using ZB.MOM.WW.OtOpcUa.Admin.Services
|
||||
|
||||
<h1 class="mb-4">My account</h1>
|
||||
|
||||
<AuthorizeView>
|
||||
<Authorized>
|
||||
@{
|
||||
var username = context.User.FindFirst(ClaimTypes.NameIdentifier)?.Value ?? "—";
|
||||
var displayName = context.User.Identity?.Name ?? "—";
|
||||
var roles = context.User.Claims
|
||||
.Where(c => c.Type == ClaimTypes.Role).Select(c => c.Value).ToList();
|
||||
var ldapGroups = context.User.Claims
|
||||
.Where(c => c.Type == "ldap_group").Select(c => c.Value).ToList();
|
||||
}
|
||||
|
||||
<div class="row g-4">
|
||||
<div class="col-md-6">
|
||||
<div class="card">
|
||||
<div class="card-body">
|
||||
<h5 class="card-title">Identity</h5>
|
||||
<dl class="row mb-0">
|
||||
<dt class="col-sm-4">Username</dt><dd class="col-sm-8"><code>@username</code></dd>
|
||||
<dt class="col-sm-4">Display name</dt><dd class="col-sm-8">@displayName</dd>
|
||||
</dl>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col-md-6">
|
||||
<div class="card">
|
||||
<div class="card-body">
|
||||
<h5 class="card-title">Admin roles</h5>
|
||||
@if (roles.Count == 0)
|
||||
{
|
||||
<p class="text-muted mb-0">No Admin roles mapped — sign-in would have been blocked, so if you're seeing this, the session claim is likely stale.</p>
|
||||
}
|
||||
else
|
||||
{
|
||||
<div class="mb-2">
|
||||
@foreach (var r in roles)
|
||||
{
|
||||
<span class="badge bg-primary me-1">@r</span>
|
||||
}
|
||||
</div>
|
||||
<small class="text-muted">LDAP groups: @(ldapGroups.Count == 0 ? "(none surfaced)" : string.Join(", ", ldapGroups))</small>
|
||||
}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col-12">
|
||||
<div class="card">
|
||||
<div class="card-body">
|
||||
<h5 class="card-title">Capabilities</h5>
|
||||
<p class="text-muted small">
|
||||
Each Admin role grants a fixed capability set per <code>admin-ui.md</code> §Admin Roles.
|
||||
Pages below reflect what this session can access; the route's <code>[Authorize]</code> guard
|
||||
is the ground truth — this table mirrors it for readability.
|
||||
</p>
|
||||
<table class="table table-sm align-middle mb-0">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Capability</th>
|
||||
<th>Required role(s)</th>
|
||||
<th class="text-end">You have it?</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
@foreach (var cap in Capabilities)
|
||||
{
|
||||
var has = cap.RequiredRoles.Any(r => roles.Contains(r, StringComparer.OrdinalIgnoreCase));
|
||||
<tr>
|
||||
<td>@cap.Name<br /><small class="text-muted">@cap.Description</small></td>
|
||||
<td>@string.Join(" or ", cap.RequiredRoles)</td>
|
||||
<td class="text-end">
|
||||
@if (has)
|
||||
{
|
||||
<span class="badge bg-success">Yes</span>
|
||||
}
|
||||
else
|
||||
{
|
||||
<span class="badge bg-secondary">No</span>
|
||||
}
|
||||
</td>
|
||||
</tr>
|
||||
}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="mt-4">
|
||||
<form method="post" action="/auth/logout">
|
||||
<button class="btn btn-outline-danger" type="submit">Sign out</button>
|
||||
</form>
|
||||
</div>
|
||||
</Authorized>
|
||||
</AuthorizeView>
|
||||
|
||||
@code {
|
||||
private sealed record Capability(string Name, string Description, string[] RequiredRoles);
|
||||
|
||||
// Kept in sync with Program.cs authorization policies + each page's [Authorize] attribute.
|
||||
// When a new page or policy is added, extend this list so operators can self-service check
|
||||
// whether their session has access without trial-and-error navigation.
|
||||
private static readonly IReadOnlyList<Capability> Capabilities =
|
||||
[
|
||||
new("View clusters + fleet status",
|
||||
"Read-only access to the cluster list, fleet dashboard, and generation history.",
|
||||
[AdminRoles.ConfigViewer, AdminRoles.ConfigEditor, AdminRoles.FleetAdmin]),
|
||||
new("Edit configuration drafts",
|
||||
"Create and edit draft generations, manage namespace bindings and node ACLs. CanEdit policy.",
|
||||
[AdminRoles.ConfigEditor, AdminRoles.FleetAdmin]),
|
||||
new("Publish generations",
|
||||
"Promote a draft to Published — triggers node roll-out. CanPublish policy.",
|
||||
[AdminRoles.FleetAdmin]),
|
||||
new("Manage certificate trust",
|
||||
"Trust rejected client certs + revoke trust. FleetAdmin-only because the trust decision gates OPC UA client access.",
|
||||
[AdminRoles.FleetAdmin]),
|
||||
new("Manage external-ID reservations",
|
||||
"Reserve / release external IDs that map into Galaxy contained names.",
|
||||
[AdminRoles.ConfigEditor, AdminRoles.FleetAdmin]),
|
||||
];
|
||||
}
|
||||
154
src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Certificates.razor
Normal file
154
src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Certificates.razor
Normal file
@@ -0,0 +1,154 @@
|
||||
@page "/certificates"
|
||||
@attribute [Microsoft.AspNetCore.Authorization.Authorize(Roles = AdminRoles.FleetAdmin)]
|
||||
@using ZB.MOM.WW.OtOpcUa.Admin.Services
|
||||
@inject CertTrustService Certs
|
||||
@inject AuthenticationStateProvider AuthState
|
||||
@inject ILogger<Certificates> Log
|
||||
|
||||
<h1 class="mb-4">Certificate trust</h1>
|
||||
|
||||
<div class="alert alert-info small mb-4">
|
||||
PKI store root <code>@Certs.PkiStoreRoot</code>. Trusting a rejected cert moves the file into the trusted store — the OPC UA server picks up the change on the next client handshake, so operators should retry the rejected client's connection after trusting.
|
||||
</div>
|
||||
|
||||
@if (_status is not null)
|
||||
{
|
||||
<div class="alert alert-@_statusKind alert-dismissible">
|
||||
@_status
|
||||
<button type="button" class="btn-close" @onclick="ClearStatus"></button>
|
||||
</div>
|
||||
}
|
||||
|
||||
<h2 class="h4">Rejected (@_rejected.Count)</h2>
|
||||
@if (_rejected.Count == 0)
|
||||
{
|
||||
<p class="text-muted">No rejected certificates. Clients that fail to handshake with an untrusted cert land here.</p>
|
||||
}
|
||||
else
|
||||
{
|
||||
<table class="table table-sm align-middle">
|
||||
<thead><tr><th>Subject</th><th>Issuer</th><th>Thumbprint</th><th>Valid</th><th class="text-end">Actions</th></tr></thead>
|
||||
<tbody>
|
||||
@foreach (var c in _rejected)
|
||||
{
|
||||
<tr>
|
||||
<td>@c.Subject</td>
|
||||
<td>@c.Issuer</td>
|
||||
<td><code class="small">@c.Thumbprint</code></td>
|
||||
<td class="small">@c.NotBefore.ToString("yyyy-MM-dd") → @c.NotAfter.ToString("yyyy-MM-dd")</td>
|
||||
<td class="text-end">
|
||||
<button class="btn btn-sm btn-success me-1" @onclick="() => TrustAsync(c)">Trust</button>
|
||||
<button class="btn btn-sm btn-outline-danger" @onclick="() => DeleteRejectedAsync(c)">Delete</button>
|
||||
</td>
|
||||
</tr>
|
||||
}
|
||||
</tbody>
|
||||
</table>
|
||||
}
|
||||
|
||||
<h2 class="h4 mt-5">Trusted (@_trusted.Count)</h2>
|
||||
@if (_trusted.Count == 0)
|
||||
{
|
||||
<p class="text-muted">No client certs have been explicitly trusted. The server's own application cert lives in <code>own/</code> and is not listed here.</p>
|
||||
}
|
||||
else
|
||||
{
|
||||
<table class="table table-sm align-middle">
|
||||
<thead><tr><th>Subject</th><th>Issuer</th><th>Thumbprint</th><th>Valid</th><th class="text-end">Actions</th></tr></thead>
|
||||
<tbody>
|
||||
@foreach (var c in _trusted)
|
||||
{
|
||||
<tr>
|
||||
<td>@c.Subject</td>
|
||||
<td>@c.Issuer</td>
|
||||
<td><code class="small">@c.Thumbprint</code></td>
|
||||
<td class="small">@c.NotBefore.ToString("yyyy-MM-dd") → @c.NotAfter.ToString("yyyy-MM-dd")</td>
|
||||
<td class="text-end">
|
||||
<button class="btn btn-sm btn-outline-danger" @onclick="() => UntrustAsync(c)">Revoke</button>
|
||||
</td>
|
||||
</tr>
|
||||
}
|
||||
</tbody>
|
||||
</table>
|
||||
}
|
||||
|
||||
@code {
|
||||
private IReadOnlyList<CertInfo> _rejected = [];
|
||||
private IReadOnlyList<CertInfo> _trusted = [];
|
||||
private string? _status;
|
||||
private string _statusKind = "success";
|
||||
|
||||
protected override void OnInitialized() => Reload();
|
||||
|
||||
private void Reload()
|
||||
{
|
||||
_rejected = Certs.ListRejected();
|
||||
_trusted = Certs.ListTrusted();
|
||||
}
|
||||
|
||||
private async Task TrustAsync(CertInfo c)
|
||||
{
|
||||
if (Certs.TrustRejected(c.Thumbprint))
|
||||
{
|
||||
await LogActionAsync("cert.trust", c);
|
||||
Set($"Trusted cert {c.Subject} ({Short(c.Thumbprint)}).", "success");
|
||||
}
|
||||
else
|
||||
{
|
||||
Set($"Could not trust {Short(c.Thumbprint)} — file missing; another admin may have already handled it.", "warning");
|
||||
}
|
||||
Reload();
|
||||
}
|
||||
|
||||
private async Task DeleteRejectedAsync(CertInfo c)
|
||||
{
|
||||
if (Certs.DeleteRejected(c.Thumbprint))
|
||||
{
|
||||
await LogActionAsync("cert.delete.rejected", c);
|
||||
Set($"Deleted rejected cert {c.Subject} ({Short(c.Thumbprint)}).", "success");
|
||||
}
|
||||
else
|
||||
{
|
||||
Set($"Could not delete {Short(c.Thumbprint)} — file missing.", "warning");
|
||||
}
|
||||
Reload();
|
||||
}
|
||||
|
||||
private async Task UntrustAsync(CertInfo c)
|
||||
{
|
||||
if (Certs.UntrustCert(c.Thumbprint))
|
||||
{
|
||||
await LogActionAsync("cert.untrust", c);
|
||||
Set($"Revoked trust for {c.Subject} ({Short(c.Thumbprint)}).", "success");
|
||||
}
|
||||
else
|
||||
{
|
||||
Set($"Could not revoke {Short(c.Thumbprint)} — file missing.", "warning");
|
||||
}
|
||||
Reload();
|
||||
}
|
||||
|
||||
private async Task LogActionAsync(string action, CertInfo c)
|
||||
{
|
||||
// Cert trust changes are operator-initiated and security-sensitive — Serilog captures the
|
||||
// user + thumbprint trail. CertTrustService also logs at Information on each filesystem
|
||||
// move/delete; this line ties the action to the authenticated admin user so the two logs
|
||||
// correlate. DB-level ConfigAuditLog persistence is deferred — its schema is
|
||||
// cluster-scoped and cert actions are cluster-agnostic.
|
||||
var state = await AuthState.GetAuthenticationStateAsync();
|
||||
var user = state.User.Identity?.Name ?? "(anonymous)";
|
||||
Log.LogInformation("Admin cert action: user={User} action={Action} thumbprint={Thumbprint} subject={Subject}",
|
||||
user, action, c.Thumbprint, c.Subject);
|
||||
}
|
||||
|
||||
private void Set(string message, string kind)
|
||||
{
|
||||
_status = message;
|
||||
_statusKind = kind;
|
||||
}
|
||||
|
||||
private void ClearStatus() => _status = null;
|
||||
|
||||
private static string Short(string thumbprint) =>
|
||||
thumbprint.Length > 12 ? thumbprint[..12] + "…" : thumbprint;
|
||||
}
|
||||
172
src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Fleet.razor
Normal file
172
src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Fleet.razor
Normal file
@@ -0,0 +1,172 @@
|
||||
@page "/fleet"
|
||||
@using Microsoft.EntityFrameworkCore
|
||||
@using ZB.MOM.WW.OtOpcUa.Configuration
|
||||
@using ZB.MOM.WW.OtOpcUa.Configuration.Entities
|
||||
@inject IServiceScopeFactory ScopeFactory
|
||||
@implements IDisposable
|
||||
|
||||
<h1 class="mb-4">Fleet status</h1>
|
||||
|
||||
<div class="d-flex align-items-center mb-3 gap-2">
|
||||
<button class="btn btn-sm btn-outline-primary" @onclick="RefreshAsync" disabled="@_refreshing">
|
||||
@if (_refreshing) { <span class="spinner-border spinner-border-sm me-1" /> }
|
||||
Refresh
|
||||
</button>
|
||||
<span class="text-muted small">
|
||||
Auto-refresh every @RefreshIntervalSeconds s. Last updated: @(_lastRefreshUtc?.ToString("HH:mm:ss 'UTC'") ?? "—")
|
||||
</span>
|
||||
</div>
|
||||
|
||||
@if (_rows is null)
|
||||
{
|
||||
<p>Loading…</p>
|
||||
}
|
||||
else if (_rows.Count == 0)
|
||||
{
|
||||
<div class="alert alert-info">
|
||||
No node state recorded yet. Nodes publish their state to the central DB on each poll; if
|
||||
this list is empty, either no nodes have been registered or the poller hasn't run yet.
|
||||
</div>
|
||||
}
|
||||
else
|
||||
{
|
||||
<div class="row g-3 mb-4">
|
||||
<div class="col-md-3">
|
||||
<div class="card"><div class="card-body">
|
||||
<h6 class="text-muted mb-1">Nodes</h6>
|
||||
<div class="fs-3">@_rows.Count</div>
|
||||
</div></div>
|
||||
</div>
|
||||
<div class="col-md-3">
|
||||
<div class="card border-success"><div class="card-body">
|
||||
<h6 class="text-muted mb-1">Applied</h6>
|
||||
<div class="fs-3 text-success">@_rows.Count(r => r.Status == "Applied")</div>
|
||||
</div></div>
|
||||
</div>
|
||||
<div class="col-md-3">
|
||||
<div class="card border-warning"><div class="card-body">
|
||||
<h6 class="text-muted mb-1">Stale</h6>
|
||||
<div class="fs-3 text-warning">@_rows.Count(r => IsStale(r))</div>
|
||||
</div></div>
|
||||
</div>
|
||||
<div class="col-md-3">
|
||||
<div class="card border-danger"><div class="card-body">
|
||||
<h6 class="text-muted mb-1">Failed</h6>
|
||||
<div class="fs-3 text-danger">@_rows.Count(r => r.Status == "Failed")</div>
|
||||
</div></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<table class="table table-hover align-middle">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Node</th>
|
||||
<th>Cluster</th>
|
||||
<th>Generation</th>
|
||||
<th>Status</th>
|
||||
<th>Last applied</th>
|
||||
<th>Last seen</th>
|
||||
<th>Error</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
@foreach (var r in _rows)
|
||||
{
|
||||
<tr class="@RowClass(r)">
|
||||
<td><code>@r.NodeId</code></td>
|
||||
<td>@r.ClusterId</td>
|
||||
<td>@(r.GenerationId?.ToString() ?? "—")</td>
|
||||
<td>
|
||||
<span class="badge @StatusBadge(r.Status)">@(r.Status ?? "—")</span>
|
||||
</td>
|
||||
<td>@FormatAge(r.AppliedAt)</td>
|
||||
<td class="@(IsStale(r) ? "text-warning" : "")">@FormatAge(r.SeenAt)</td>
|
||||
<td class="text-truncate" style="max-width: 320px;" title="@r.Error">@r.Error</td>
|
||||
</tr>
|
||||
}
|
||||
</tbody>
|
||||
</table>
|
||||
}
|
||||
|
||||
@code {
|
||||
// Refresh cadence. 5s matches FleetStatusPoller's poll interval — the dashboard always sees
|
||||
// the most recent published state without polling ahead of the broadcaster.
|
||||
private const int RefreshIntervalSeconds = 5;
|
||||
|
||||
private List<FleetNodeRow>? _rows;
|
||||
private bool _refreshing;
|
||||
private DateTime? _lastRefreshUtc;
|
||||
private Timer? _timer;
|
||||
|
||||
protected override async Task OnInitializedAsync()
|
||||
{
|
||||
await RefreshAsync();
|
||||
_timer = new Timer(async _ => await InvokeAsync(RefreshAsync),
|
||||
state: null,
|
||||
dueTime: TimeSpan.FromSeconds(RefreshIntervalSeconds),
|
||||
period: TimeSpan.FromSeconds(RefreshIntervalSeconds));
|
||||
}
|
||||
|
||||
private async Task RefreshAsync()
|
||||
{
|
||||
if (_refreshing) return;
|
||||
_refreshing = true;
|
||||
try
|
||||
{
|
||||
using var scope = ScopeFactory.CreateScope();
|
||||
var db = scope.ServiceProvider.GetRequiredService<OtOpcUaConfigDbContext>();
|
||||
var rows = await db.ClusterNodeGenerationStates.AsNoTracking()
|
||||
.Join(db.ClusterNodes.AsNoTracking(), s => s.NodeId, n => n.NodeId, (s, n) => new FleetNodeRow(
|
||||
s.NodeId, n.ClusterId, s.CurrentGenerationId,
|
||||
s.LastAppliedStatus != null ? s.LastAppliedStatus.ToString() : null,
|
||||
s.LastAppliedError, s.LastAppliedAt, s.LastSeenAt))
|
||||
.OrderBy(r => r.ClusterId)
|
||||
.ThenBy(r => r.NodeId)
|
||||
.ToListAsync();
|
||||
_rows = rows;
|
||||
_lastRefreshUtc = DateTime.UtcNow;
|
||||
}
|
||||
finally
|
||||
{
|
||||
_refreshing = false;
|
||||
StateHasChanged();
|
||||
}
|
||||
}
|
||||
|
||||
private static bool IsStale(FleetNodeRow r)
|
||||
{
|
||||
if (r.SeenAt is null) return true;
|
||||
return (DateTime.UtcNow - r.SeenAt.Value) > TimeSpan.FromSeconds(30);
|
||||
}
|
||||
|
||||
private static string RowClass(FleetNodeRow r) => r.Status switch
|
||||
{
|
||||
"Failed" => "table-danger",
|
||||
_ when IsStale(r) => "table-warning",
|
||||
_ => "",
|
||||
};
|
||||
|
||||
private static string StatusBadge(string? status) => status switch
|
||||
{
|
||||
"Applied" => "bg-success",
|
||||
"Failed" => "bg-danger",
|
||||
"Applying" => "bg-info",
|
||||
_ => "bg-secondary",
|
||||
};
|
||||
|
||||
private static string FormatAge(DateTime? t)
|
||||
{
|
||||
if (t is null) return "—";
|
||||
var age = DateTime.UtcNow - t.Value;
|
||||
if (age.TotalSeconds < 60) return $"{(int)age.TotalSeconds}s ago";
|
||||
if (age.TotalMinutes < 60) return $"{(int)age.TotalMinutes}m ago";
|
||||
if (age.TotalHours < 24) return $"{(int)age.TotalHours}h ago";
|
||||
return t.Value.ToString("yyyy-MM-dd HH:mm 'UTC'");
|
||||
}
|
||||
|
||||
public void Dispose() => _timer?.Dispose();
|
||||
|
||||
internal sealed record FleetNodeRow(
|
||||
string NodeId, string ClusterId, long? GenerationId,
|
||||
string? Status, string? Error, DateTime? AppliedAt, DateTime? SeenAt);
|
||||
}
|
||||
160
src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Hosts.razor
Normal file
160
src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Hosts.razor
Normal file
@@ -0,0 +1,160 @@
|
||||
@page "/hosts"
|
||||
@using Microsoft.EntityFrameworkCore
|
||||
@using ZB.MOM.WW.OtOpcUa.Admin.Services
|
||||
@using ZB.MOM.WW.OtOpcUa.Configuration.Enums
|
||||
@inject IServiceScopeFactory ScopeFactory
|
||||
@implements IDisposable
|
||||
|
||||
<h1 class="mb-4">Driver host status</h1>
|
||||
|
||||
<div class="d-flex align-items-center mb-3 gap-2">
|
||||
<button class="btn btn-sm btn-outline-primary" @onclick="RefreshAsync" disabled="@_refreshing">
|
||||
@if (_refreshing) { <span class="spinner-border spinner-border-sm me-1" /> }
|
||||
Refresh
|
||||
</button>
|
||||
<span class="text-muted small">
|
||||
Auto-refresh every @RefreshIntervalSeconds s. Last updated: @(_lastRefreshUtc?.ToString("HH:mm:ss 'UTC'") ?? "—")
|
||||
</span>
|
||||
</div>
|
||||
|
||||
<div class="alert alert-info small mb-4">
|
||||
Each row is one host reported by a driver instance on a server node. Galaxy drivers report
|
||||
per-Platform / per-AppEngine entries; Modbus drivers report the PLC endpoint. Rows age out
|
||||
of the Server's publisher on every 10-second heartbeat — rows whose LastSeen is older than
|
||||
30s are flagged Stale, which usually means the owning Server process has crashed or lost
|
||||
its DB connection.
|
||||
</div>
|
||||
|
||||
@if (_rows is null)
|
||||
{
|
||||
<p>Loading…</p>
|
||||
}
|
||||
else if (_rows.Count == 0)
|
||||
{
|
||||
<div class="alert alert-secondary">
|
||||
No host-status rows yet. The Server publishes its first tick 2s after startup; if this list stays empty, check that the Server is running and the driver implements <code>IHostConnectivityProbe</code>.
|
||||
</div>
|
||||
}
|
||||
else
|
||||
{
|
||||
<div class="row g-3 mb-4">
|
||||
<div class="col-md-3"><div class="card"><div class="card-body">
|
||||
<h6 class="text-muted mb-1">Hosts</h6>
|
||||
<div class="fs-3">@_rows.Count</div>
|
||||
</div></div></div>
|
||||
<div class="col-md-3"><div class="card border-success"><div class="card-body">
|
||||
<h6 class="text-muted mb-1">Running</h6>
|
||||
<div class="fs-3 text-success">@_rows.Count(r => r.State == DriverHostState.Running && !HostStatusService.IsStale(r))</div>
|
||||
</div></div></div>
|
||||
<div class="col-md-3"><div class="card border-warning"><div class="card-body">
|
||||
<h6 class="text-muted mb-1">Stale</h6>
|
||||
<div class="fs-3 text-warning">@_rows.Count(HostStatusService.IsStale)</div>
|
||||
</div></div></div>
|
||||
<div class="col-md-3"><div class="card border-danger"><div class="card-body">
|
||||
<h6 class="text-muted mb-1">Faulted</h6>
|
||||
<div class="fs-3 text-danger">@_rows.Count(r => r.State == DriverHostState.Faulted)</div>
|
||||
</div></div></div>
|
||||
</div>
|
||||
|
||||
@foreach (var cluster in _rows.GroupBy(r => r.ClusterId ?? "(unassigned)").OrderBy(g => g.Key))
|
||||
{
|
||||
<h2 class="h5 mt-4">Cluster: <code>@cluster.Key</code></h2>
|
||||
<table class="table table-sm table-hover align-middle">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Node</th>
|
||||
<th>Driver</th>
|
||||
<th>Host</th>
|
||||
<th>State</th>
|
||||
<th>Last transition</th>
|
||||
<th>Last seen</th>
|
||||
<th>Detail</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
@foreach (var r in cluster)
|
||||
{
|
||||
<tr class="@RowClass(r)">
|
||||
<td><code>@r.NodeId</code></td>
|
||||
<td><code>@r.DriverInstanceId</code></td>
|
||||
<td>@r.HostName</td>
|
||||
<td>
|
||||
<span class="badge @StateBadge(r.State)">@r.State</span>
|
||||
@if (HostStatusService.IsStale(r))
|
||||
{
|
||||
<span class="badge bg-warning text-dark ms-1">Stale</span>
|
||||
}
|
||||
</td>
|
||||
<td class="small">@FormatAge(r.StateChangedUtc)</td>
|
||||
<td class="small @(HostStatusService.IsStale(r) ? "text-warning" : "")">@FormatAge(r.LastSeenUtc)</td>
|
||||
<td class="text-truncate small" style="max-width: 320px;" title="@r.Detail">@r.Detail</td>
|
||||
</tr>
|
||||
}
|
||||
</tbody>
|
||||
</table>
|
||||
}
|
||||
}
|
||||
|
||||
@code {
|
||||
// Mirrors HostStatusPublisher.HeartbeatInterval — polling ahead of the broadcaster
|
||||
// produces stale-looking rows mid-cycle.
|
||||
private const int RefreshIntervalSeconds = 10;
|
||||
|
||||
private List<HostStatusRow>? _rows;
|
||||
private bool _refreshing;
|
||||
private DateTime? _lastRefreshUtc;
|
||||
private Timer? _timer;
|
||||
|
||||
protected override async Task OnInitializedAsync()
|
||||
{
|
||||
await RefreshAsync();
|
||||
_timer = new Timer(async _ => await InvokeAsync(RefreshAsync),
|
||||
state: null,
|
||||
dueTime: TimeSpan.FromSeconds(RefreshIntervalSeconds),
|
||||
period: TimeSpan.FromSeconds(RefreshIntervalSeconds));
|
||||
}
|
||||
|
||||
private async Task RefreshAsync()
|
||||
{
|
||||
if (_refreshing) return;
|
||||
_refreshing = true;
|
||||
try
|
||||
{
|
||||
using var scope = ScopeFactory.CreateScope();
|
||||
var svc = scope.ServiceProvider.GetRequiredService<HostStatusService>();
|
||||
_rows = (await svc.ListAsync()).ToList();
|
||||
_lastRefreshUtc = DateTime.UtcNow;
|
||||
}
|
||||
finally
|
||||
{
|
||||
_refreshing = false;
|
||||
StateHasChanged();
|
||||
}
|
||||
}
|
||||
|
||||
private static string RowClass(HostStatusRow r) => r.State switch
|
||||
{
|
||||
DriverHostState.Faulted => "table-danger",
|
||||
_ when HostStatusService.IsStale(r) => "table-warning",
|
||||
_ => "",
|
||||
};
|
||||
|
||||
private static string StateBadge(DriverHostState s) => s switch
|
||||
{
|
||||
DriverHostState.Running => "bg-success",
|
||||
DriverHostState.Stopped => "bg-secondary",
|
||||
DriverHostState.Faulted => "bg-danger",
|
||||
_ => "bg-secondary",
|
||||
};
|
||||
|
||||
private static string FormatAge(DateTime t)
|
||||
{
|
||||
var age = DateTime.UtcNow - t;
|
||||
if (age.TotalSeconds < 60) return $"{(int)age.TotalSeconds}s ago";
|
||||
if (age.TotalMinutes < 60) return $"{(int)age.TotalMinutes}m ago";
|
||||
if (age.TotalHours < 24) return $"{(int)age.TotalHours}h ago";
|
||||
return t.ToString("yyyy-MM-dd HH:mm 'UTC'");
|
||||
}
|
||||
|
||||
public void Dispose() => _timer?.Dispose();
|
||||
}
|
||||
@@ -47,6 +47,13 @@ builder.Services.AddScoped<NodeAclService>();
|
||||
builder.Services.AddScoped<ReservationService>();
|
||||
builder.Services.AddScoped<DraftValidationService>();
|
||||
builder.Services.AddScoped<AuditLogService>();
|
||||
builder.Services.AddScoped<HostStatusService>();
|
||||
|
||||
// Cert-trust management — reads the OPC UA server's PKI store root so rejected client certs
|
||||
// can be promoted to trusted via the Admin UI. Singleton: no per-request state, just
|
||||
// filesystem operations.
|
||||
builder.Services.Configure<CertTrustOptions>(builder.Configuration.GetSection(CertTrustOptions.SectionName));
|
||||
builder.Services.AddSingleton<CertTrustService>();
|
||||
|
||||
// LDAP auth — parity with ScadaLink's LdapAuthService (decision #102).
|
||||
builder.Services.Configure<LdapOptions>(
|
||||
|
||||
22
src/ZB.MOM.WW.OtOpcUa.Admin/Services/CertTrustOptions.cs
Normal file
22
src/ZB.MOM.WW.OtOpcUa.Admin/Services/CertTrustOptions.cs
Normal file
@@ -0,0 +1,22 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Admin.Services;
|
||||
|
||||
/// <summary>
|
||||
/// Points the Admin UI at the OPC UA Server's PKI store root so
|
||||
/// <see cref="CertTrustService"/> can list and move certs between the
|
||||
/// <c>rejected/</c> and <c>trusted/</c> directories the server maintains. Must match the
|
||||
/// <c>OpcUaServer:PkiStoreRoot</c> the Server process is configured with.
|
||||
/// </summary>
|
||||
public sealed class CertTrustOptions
|
||||
{
|
||||
public const string SectionName = "CertTrust";
|
||||
|
||||
/// <summary>
|
||||
/// Absolute path to the PKI root. Defaults to
|
||||
/// <c>%ProgramData%\OtOpcUa\pki</c> — matches <c>OpcUaServerOptions.PkiStoreRoot</c>'s
|
||||
/// default so a standard side-by-side install needs no override.
|
||||
/// </summary>
|
||||
public string PkiStoreRoot { get; init; } =
|
||||
Path.Combine(
|
||||
Environment.GetFolderPath(Environment.SpecialFolder.CommonApplicationData),
|
||||
"OtOpcUa", "pki");
|
||||
}
|
||||
135
src/ZB.MOM.WW.OtOpcUa.Admin/Services/CertTrustService.cs
Normal file
135
src/ZB.MOM.WW.OtOpcUa.Admin/Services/CertTrustService.cs
Normal file
@@ -0,0 +1,135 @@
|
||||
using System.Security.Cryptography.X509Certificates;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Admin.Services;
|
||||
|
||||
/// <summary>
|
||||
/// Metadata for a certificate file found in one of the OPC UA server's PKI stores. The
|
||||
/// <see cref="FilePath"/> is the absolute path of the DER/CRT file the stack created when it
|
||||
/// rejected the cert (for <see cref="CertStoreKind.Rejected"/>) or when an operator trusted
|
||||
/// it (for <see cref="CertStoreKind.Trusted"/>).
|
||||
/// </summary>
|
||||
public sealed record CertInfo(
|
||||
string Thumbprint,
|
||||
string Subject,
|
||||
string Issuer,
|
||||
DateTime NotBefore,
|
||||
DateTime NotAfter,
|
||||
string FilePath,
|
||||
CertStoreKind Store);
|
||||
|
||||
public enum CertStoreKind
|
||||
{
|
||||
Rejected,
|
||||
Trusted,
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Filesystem-backed view over the OPC UA server's PKI store. The Opc.Ua stack uses a
|
||||
/// Directory-typed store — each cert is a <c>.der</c> file under <c>{root}/{store}/certs/</c>
|
||||
/// with a filename derived from subject + thumbprint. This service exposes operators for the
|
||||
/// Admin UI: list rejected, list trusted, trust a rejected cert (move to trusted), remove a
|
||||
/// rejected cert (delete), untrust a previously trusted cert (delete from trusted).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The Admin process is separate from the Server process; this service deliberately has no
|
||||
/// Opc.Ua dependency — it works on the on-disk layout directly so it can run on the Admin
|
||||
/// host even when the Server isn't installed locally, as long as the PKI root is reachable
|
||||
/// (typical deployment has Admin + Server side-by-side on the same machine).
|
||||
///
|
||||
/// Trust/untrust requires the Server to re-read its trust list. The Opc.Ua stack re-reads
|
||||
/// the Directory store on each new incoming connection, so there's no explicit signal
|
||||
/// needed — the next client handshake picks up the change. Operators should retry the
|
||||
/// rejected client's connection after trusting.
|
||||
/// </remarks>
|
||||
public sealed class CertTrustService
|
||||
{
|
||||
private readonly CertTrustOptions _options;
|
||||
private readonly ILogger<CertTrustService> _logger;
|
||||
|
||||
public CertTrustService(IOptions<CertTrustOptions> options, ILogger<CertTrustService> logger)
|
||||
{
|
||||
_options = options.Value;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
public string PkiStoreRoot => _options.PkiStoreRoot;
|
||||
|
||||
public IReadOnlyList<CertInfo> ListRejected() => ListStore(CertStoreKind.Rejected);
|
||||
public IReadOnlyList<CertInfo> ListTrusted() => ListStore(CertStoreKind.Trusted);
|
||||
|
||||
/// <summary>
|
||||
/// Move the cert with <paramref name="thumbprint"/> from the rejected store to the
|
||||
/// trusted store. No-op returns false if the rejected file doesn't exist (already moved
|
||||
/// by another operator, or thumbprint mismatch). Overwrites an existing trusted copy
|
||||
/// silently — idempotent.
|
||||
/// </summary>
|
||||
public bool TrustRejected(string thumbprint)
|
||||
{
|
||||
var cert = FindInStore(CertStoreKind.Rejected, thumbprint);
|
||||
if (cert is null) return false;
|
||||
|
||||
var trustedDir = CertsDir(CertStoreKind.Trusted);
|
||||
Directory.CreateDirectory(trustedDir);
|
||||
var destPath = Path.Combine(trustedDir, Path.GetFileName(cert.FilePath));
|
||||
File.Move(cert.FilePath, destPath, overwrite: true);
|
||||
_logger.LogInformation("Trusted cert {Thumbprint} (subject={Subject}) — moved {From} → {To}",
|
||||
cert.Thumbprint, cert.Subject, cert.FilePath, destPath);
|
||||
return true;
|
||||
}
|
||||
|
||||
public bool DeleteRejected(string thumbprint) => DeleteFromStore(CertStoreKind.Rejected, thumbprint);
|
||||
public bool UntrustCert(string thumbprint) => DeleteFromStore(CertStoreKind.Trusted, thumbprint);
|
||||
|
||||
private bool DeleteFromStore(CertStoreKind store, string thumbprint)
|
||||
{
|
||||
var cert = FindInStore(store, thumbprint);
|
||||
if (cert is null) return false;
|
||||
File.Delete(cert.FilePath);
|
||||
_logger.LogInformation("Deleted cert {Thumbprint} (subject={Subject}) from {Store} store",
|
||||
cert.Thumbprint, cert.Subject, store);
|
||||
return true;
|
||||
}
|
||||
|
||||
private CertInfo? FindInStore(CertStoreKind store, string thumbprint) =>
|
||||
ListStore(store).FirstOrDefault(c =>
|
||||
string.Equals(c.Thumbprint, thumbprint, StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
private IReadOnlyList<CertInfo> ListStore(CertStoreKind store)
|
||||
{
|
||||
var dir = CertsDir(store);
|
||||
if (!Directory.Exists(dir)) return [];
|
||||
|
||||
var results = new List<CertInfo>();
|
||||
foreach (var path in Directory.EnumerateFiles(dir))
|
||||
{
|
||||
// Skip CRL sidecars + private-key files — trust operations only concern public certs.
|
||||
var ext = Path.GetExtension(path);
|
||||
if (!ext.Equals(".der", StringComparison.OrdinalIgnoreCase) &&
|
||||
!ext.Equals(".crt", StringComparison.OrdinalIgnoreCase) &&
|
||||
!ext.Equals(".cer", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
var cert = X509CertificateLoader.LoadCertificateFromFile(path);
|
||||
results.Add(new CertInfo(
|
||||
cert.Thumbprint, cert.Subject, cert.Issuer,
|
||||
cert.NotBefore.ToUniversalTime(), cert.NotAfter.ToUniversalTime(),
|
||||
path, store));
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// A malformed file in the store shouldn't take down the page. Surface it in logs
|
||||
// but skip — operators see the other certs and can clean the bad file manually.
|
||||
_logger.LogWarning(ex, "Failed to parse cert at {Path} — skipping", path);
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
private string CertsDir(CertStoreKind store) =>
|
||||
Path.Combine(_options.PkiStoreRoot, store == CertStoreKind.Rejected ? "rejected" : "trusted", "certs");
|
||||
}
|
||||
63
src/ZB.MOM.WW.OtOpcUa.Admin/Services/HostStatusService.cs
Normal file
63
src/ZB.MOM.WW.OtOpcUa.Admin/Services/HostStatusService.cs
Normal file
@@ -0,0 +1,63 @@
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Admin.Services;
|
||||
|
||||
/// <summary>
|
||||
/// One row per <see cref="DriverHostStatus"/> record, enriched with the owning
|
||||
/// <c>ClusterNode.ClusterId</c> when available (left-join). The Admin <c>/hosts</c> page
|
||||
/// groups by cluster and renders a per-node → per-driver → per-host tree.
|
||||
/// </summary>
|
||||
public sealed record HostStatusRow(
|
||||
string NodeId,
|
||||
string? ClusterId,
|
||||
string DriverInstanceId,
|
||||
string HostName,
|
||||
DriverHostState State,
|
||||
DateTime StateChangedUtc,
|
||||
DateTime LastSeenUtc,
|
||||
string? Detail);
|
||||
|
||||
/// <summary>
|
||||
/// Read-side service for the Admin UI's per-host drill-down. Loads
|
||||
/// <see cref="DriverHostStatus"/> rows (written by the Server process's
|
||||
/// <c>HostStatusPublisher</c>) and left-joins <c>ClusterNode</c> so each row knows which
|
||||
/// cluster it belongs to — the Admin UI groups by cluster for the fleet-wide view.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The publisher heartbeat is 10s (<c>HostStatusPublisher.HeartbeatInterval</c>). The
|
||||
/// Admin page also polls every ~10s and treats rows with <c>LastSeenUtc</c> older than
|
||||
/// <c>StaleThreshold</c> (30s) as stale — covers a missed heartbeat tolerance plus
|
||||
/// a generous buffer for clock skew and publisher GC pauses.
|
||||
/// </remarks>
|
||||
public sealed class HostStatusService(OtOpcUaConfigDbContext db)
|
||||
{
|
||||
public static readonly TimeSpan StaleThreshold = TimeSpan.FromSeconds(30);
|
||||
|
||||
public async Task<IReadOnlyList<HostStatusRow>> ListAsync(CancellationToken ct = default)
|
||||
{
|
||||
// LEFT JOIN on NodeId so a row persists even when its owning ClusterNode row hasn't
|
||||
// been created yet (first-boot bootstrap case — keeps the UI from losing sight of
|
||||
// the reporting server).
|
||||
var rows = await (from s in db.DriverHostStatuses.AsNoTracking()
|
||||
join n in db.ClusterNodes.AsNoTracking()
|
||||
on s.NodeId equals n.NodeId into nodeJoin
|
||||
from n in nodeJoin.DefaultIfEmpty()
|
||||
orderby s.NodeId, s.DriverInstanceId, s.HostName
|
||||
select new HostStatusRow(
|
||||
s.NodeId,
|
||||
n != null ? n.ClusterId : null,
|
||||
s.DriverInstanceId,
|
||||
s.HostName,
|
||||
s.State,
|
||||
s.StateChangedUtc,
|
||||
s.LastSeenUtc,
|
||||
s.Detail)).ToListAsync(ct);
|
||||
return rows;
|
||||
}
|
||||
|
||||
public static bool IsStale(HostStatusRow row) =>
|
||||
DateTime.UtcNow - row.LastSeenUtc > StaleThreshold;
|
||||
}
|
||||
@@ -0,0 +1,61 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Configuration.Entities;
|
||||
|
||||
/// <summary>
|
||||
/// Per-host connectivity snapshot the Server publishes for each driver's
|
||||
/// <c>IHostConnectivityProbe.GetHostStatuses</c> entry. One row per
|
||||
/// (<see cref="NodeId"/>, <see cref="DriverInstanceId"/>, <see cref="HostName"/>) triple —
|
||||
/// a redundant 2-node cluster with one Galaxy driver reporting 3 platforms produces 6
|
||||
/// rows, not 3, because each server node owns its own runtime view.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Closes the data-layer piece of LMX follow-up #7 (per-AppEngine Admin dashboard
|
||||
/// drill-down). The publisher hosted service on the Server side subscribes to every
|
||||
/// registered driver's <c>OnHostStatusChanged</c> and upserts rows on transitions +
|
||||
/// periodic liveness heartbeats. <see cref="LastSeenUtc"/> advances on every
|
||||
/// heartbeat so the Admin UI can flag stale rows from a crashed Server.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// No foreign-key to <see cref="ClusterNode"/> — a Server may start reporting host
|
||||
/// status before its ClusterNode row exists (e.g. first-boot bootstrap), and we'd
|
||||
/// rather keep the status row than drop it. The Admin-side service left-joins on
|
||||
/// NodeId when presenting rows.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class DriverHostStatus
|
||||
{
|
||||
/// <summary>Server node that's running the driver.</summary>
|
||||
public required string NodeId { get; set; }
|
||||
|
||||
/// <summary>Driver instance's stable id (matches <c>IDriver.DriverInstanceId</c>).</summary>
|
||||
public required string DriverInstanceId { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Driver-side host identifier — Galaxy Platform / AppEngine name, Modbus
|
||||
/// <c>host:port</c>, whatever the probe returns. Opaque to the Admin UI except as
|
||||
/// a display string.
|
||||
/// </summary>
|
||||
public required string HostName { get; set; }
|
||||
|
||||
public DriverHostState State { get; set; } = DriverHostState.Unknown;
|
||||
|
||||
/// <summary>Timestamp of the last state transition (not of the most recent heartbeat).</summary>
|
||||
public DateTime StateChangedUtc { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Advances on every publisher heartbeat — the Admin UI uses
|
||||
/// <c>now - LastSeenUtc > threshold</c> to flag rows whose owning Server has
|
||||
/// stopped reporting (crashed, network-partitioned, etc.), independent of
|
||||
/// <see cref="State"/>.
|
||||
/// </summary>
|
||||
public DateTime LastSeenUtc { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Optional human-readable detail populated when <see cref="State"/> is
|
||||
/// <see cref="DriverHostState.Faulted"/> — e.g. the exception message from the
|
||||
/// driver's probe. Null for Running / Stopped / Unknown transitions.
|
||||
/// </summary>
|
||||
public string? Detail { get; set; }
|
||||
}
|
||||
21
src/ZB.MOM.WW.OtOpcUa.Configuration/Enums/DriverHostState.cs
Normal file
21
src/ZB.MOM.WW.OtOpcUa.Configuration/Enums/DriverHostState.cs
Normal file
@@ -0,0 +1,21 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Configuration.Enums;
|
||||
|
||||
/// <summary>
|
||||
/// Persisted mirror of <c>Core.Abstractions.HostState</c> — the lifecycle state each
|
||||
/// <c>IHostConnectivityProbe</c>-capable driver reports for its per-host topology
|
||||
/// (Galaxy Platforms / AppEngines, Modbus PLC endpoints, future OPC UA gateway upstreams).
|
||||
/// Defined here instead of re-using <c>Core.Abstractions.HostState</c> so the
|
||||
/// Configuration project stays free of driver-runtime dependencies.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The server-side publisher (follow-up PR) translates
|
||||
/// <c>HostStatusChangedEventArgs.NewState</c> to this enum on every transition and
|
||||
/// upserts into <see cref="Entities.DriverHostStatus"/>. Admin UI reads from the DB.
|
||||
/// </remarks>
|
||||
public enum DriverHostState
|
||||
{
|
||||
Unknown,
|
||||
Running,
|
||||
Stopped,
|
||||
Faulted,
|
||||
}
|
||||
1248
src/ZB.MOM.WW.OtOpcUa.Configuration/Migrations/20260418193608_AddDriverHostStatus.Designer.cs
generated
Normal file
1248
src/ZB.MOM.WW.OtOpcUa.Configuration/Migrations/20260418193608_AddDriverHostStatus.Designer.cs
generated
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,49 @@
|
||||
using System;
|
||||
using Microsoft.EntityFrameworkCore.Migrations;
|
||||
|
||||
#nullable disable
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Configuration.Migrations
|
||||
{
|
||||
/// <inheritdoc />
|
||||
public partial class AddDriverHostStatus : Migration
|
||||
{
|
||||
/// <inheritdoc />
|
||||
protected override void Up(MigrationBuilder migrationBuilder)
|
||||
{
|
||||
migrationBuilder.CreateTable(
|
||||
name: "DriverHostStatus",
|
||||
columns: table => new
|
||||
{
|
||||
NodeId = table.Column<string>(type: "nvarchar(64)", maxLength: 64, nullable: false),
|
||||
DriverInstanceId = table.Column<string>(type: "nvarchar(64)", maxLength: 64, nullable: false),
|
||||
HostName = table.Column<string>(type: "nvarchar(256)", maxLength: 256, nullable: false),
|
||||
State = table.Column<string>(type: "nvarchar(16)", maxLength: 16, nullable: false),
|
||||
StateChangedUtc = table.Column<DateTime>(type: "datetime2(3)", nullable: false),
|
||||
LastSeenUtc = table.Column<DateTime>(type: "datetime2(3)", nullable: false),
|
||||
Detail = table.Column<string>(type: "nvarchar(1024)", maxLength: 1024, nullable: true)
|
||||
},
|
||||
constraints: table =>
|
||||
{
|
||||
table.PrimaryKey("PK_DriverHostStatus", x => new { x.NodeId, x.DriverInstanceId, x.HostName });
|
||||
});
|
||||
|
||||
migrationBuilder.CreateIndex(
|
||||
name: "IX_DriverHostStatus_LastSeen",
|
||||
table: "DriverHostStatus",
|
||||
column: "LastSeenUtc");
|
||||
|
||||
migrationBuilder.CreateIndex(
|
||||
name: "IX_DriverHostStatus_Node",
|
||||
table: "DriverHostStatus",
|
||||
column: "NodeId");
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
protected override void Down(MigrationBuilder migrationBuilder)
|
||||
{
|
||||
migrationBuilder.DropTable(
|
||||
name: "DriverHostStatus");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -332,6 +332,46 @@ namespace ZB.MOM.WW.OtOpcUa.Configuration.Migrations
|
||||
});
|
||||
});
|
||||
|
||||
modelBuilder.Entity("ZB.MOM.WW.OtOpcUa.Configuration.Entities.DriverHostStatus", b =>
|
||||
{
|
||||
b.Property<string>("NodeId")
|
||||
.HasMaxLength(64)
|
||||
.HasColumnType("nvarchar(64)");
|
||||
|
||||
b.Property<string>("DriverInstanceId")
|
||||
.HasMaxLength(64)
|
||||
.HasColumnType("nvarchar(64)");
|
||||
|
||||
b.Property<string>("HostName")
|
||||
.HasMaxLength(256)
|
||||
.HasColumnType("nvarchar(256)");
|
||||
|
||||
b.Property<string>("Detail")
|
||||
.HasMaxLength(1024)
|
||||
.HasColumnType("nvarchar(1024)");
|
||||
|
||||
b.Property<DateTime>("LastSeenUtc")
|
||||
.HasColumnType("datetime2(3)");
|
||||
|
||||
b.Property<string>("State")
|
||||
.IsRequired()
|
||||
.HasMaxLength(16)
|
||||
.HasColumnType("nvarchar(16)");
|
||||
|
||||
b.Property<DateTime>("StateChangedUtc")
|
||||
.HasColumnType("datetime2(3)");
|
||||
|
||||
b.HasKey("NodeId", "DriverInstanceId", "HostName");
|
||||
|
||||
b.HasIndex("LastSeenUtc")
|
||||
.HasDatabaseName("IX_DriverHostStatus_LastSeen");
|
||||
|
||||
b.HasIndex("NodeId")
|
||||
.HasDatabaseName("IX_DriverHostStatus_Node");
|
||||
|
||||
b.ToTable("DriverHostStatus", (string)null);
|
||||
});
|
||||
|
||||
modelBuilder.Entity("ZB.MOM.WW.OtOpcUa.Configuration.Entities.DriverInstance", b =>
|
||||
{
|
||||
b.Property<Guid>("DriverInstanceRowId")
|
||||
|
||||
@@ -27,6 +27,7 @@ public sealed class OtOpcUaConfigDbContext(DbContextOptions<OtOpcUaConfigDbConte
|
||||
public DbSet<ClusterNodeGenerationState> ClusterNodeGenerationStates => Set<ClusterNodeGenerationState>();
|
||||
public DbSet<ConfigAuditLog> ConfigAuditLogs => Set<ConfigAuditLog>();
|
||||
public DbSet<ExternalIdReservation> ExternalIdReservations => Set<ExternalIdReservation>();
|
||||
public DbSet<DriverHostStatus> DriverHostStatuses => Set<DriverHostStatus>();
|
||||
|
||||
protected override void OnModelCreating(ModelBuilder modelBuilder)
|
||||
{
|
||||
@@ -47,6 +48,7 @@ public sealed class OtOpcUaConfigDbContext(DbContextOptions<OtOpcUaConfigDbConte
|
||||
ConfigureClusterNodeGenerationState(modelBuilder);
|
||||
ConfigureConfigAuditLog(modelBuilder);
|
||||
ConfigureExternalIdReservation(modelBuilder);
|
||||
ConfigureDriverHostStatus(modelBuilder);
|
||||
}
|
||||
|
||||
private static void ConfigureServerCluster(ModelBuilder modelBuilder)
|
||||
@@ -484,4 +486,30 @@ public sealed class OtOpcUaConfigDbContext(DbContextOptions<OtOpcUaConfigDbConte
|
||||
e.HasIndex(x => x.EquipmentUuid).HasDatabaseName("IX_ExternalIdReservation_Equipment");
|
||||
});
|
||||
}
|
||||
|
||||
private static void ConfigureDriverHostStatus(ModelBuilder modelBuilder)
|
||||
{
|
||||
modelBuilder.Entity<DriverHostStatus>(e =>
|
||||
{
|
||||
e.ToTable("DriverHostStatus");
|
||||
// Composite key — one row per (server node, driver instance, probe-reported host).
|
||||
// A redundant 2-node cluster with one Galaxy driver reporting 3 platforms produces
|
||||
// 6 rows because each server node owns its own runtime view; the composite key is
|
||||
// what lets both views coexist without shadowing each other.
|
||||
e.HasKey(x => new { x.NodeId, x.DriverInstanceId, x.HostName });
|
||||
e.Property(x => x.NodeId).HasMaxLength(64);
|
||||
e.Property(x => x.DriverInstanceId).HasMaxLength(64);
|
||||
e.Property(x => x.HostName).HasMaxLength(256);
|
||||
e.Property(x => x.State).HasConversion<string>().HasMaxLength(16);
|
||||
e.Property(x => x.StateChangedUtc).HasColumnType("datetime2(3)");
|
||||
e.Property(x => x.LastSeenUtc).HasColumnType("datetime2(3)");
|
||||
e.Property(x => x.Detail).HasMaxLength(1024);
|
||||
|
||||
// NodeId-only index drives the Admin UI's per-cluster drill-down (select all host
|
||||
// statuses for the nodes of a specific cluster via join on ClusterNode.ClusterId).
|
||||
e.HasIndex(x => x.NodeId).HasDatabaseName("IX_DriverHostStatus_Node");
|
||||
// LastSeenUtc index powers the Admin UI's stale-row query (now - LastSeen > N).
|
||||
e.HasIndex(x => x.LastSeenUtc).HasDatabaseName("IX_DriverHostStatus_LastSeen");
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,6 +25,14 @@ namespace ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
/// OPC UA <c>AlarmConditionState</c> when true. Defaults to false so existing non-Galaxy
|
||||
/// drivers aren't forced to flow a flag they don't produce.
|
||||
/// </param>
|
||||
/// <param name="WriteIdempotent">
|
||||
/// True when a timed-out or failed write to this attribute is safe to replay. Per
|
||||
/// <c>docs/v2/plan.md</c> decisions #44, #45, #143 — writes are NOT auto-retried by default
|
||||
/// because replaying a pulse / alarm-ack / counter-increment / recipe-step advance can
|
||||
/// duplicate field actions. Drivers flag only tags whose semantics make retry safe
|
||||
/// (holding registers with level-set values, set-point writes to analog tags) — the
|
||||
/// capability invoker respects this flag when deciding whether to apply Polly retry.
|
||||
/// </param>
|
||||
public sealed record DriverAttributeInfo(
|
||||
string FullName,
|
||||
DriverDataType DriverDataType,
|
||||
@@ -32,4 +40,5 @@ public sealed record DriverAttributeInfo(
|
||||
uint? ArrayDim,
|
||||
SecurityClassification SecurityClass,
|
||||
bool IsHistorized,
|
||||
bool IsAlarm = false);
|
||||
bool IsAlarm = false,
|
||||
bool WriteIdempotent = false);
|
||||
|
||||
42
src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/DriverCapability.cs
Normal file
42
src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/DriverCapability.cs
Normal file
@@ -0,0 +1,42 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
/// <summary>
|
||||
/// Enumerates the driver-capability surface points guarded by Phase 6.1 resilience pipelines.
|
||||
/// Each value corresponds to one method (or tightly-related method group) on the
|
||||
/// <c>Core.Abstractions</c> capability interfaces (<see cref="IReadable"/>, <see cref="IWritable"/>,
|
||||
/// <see cref="ITagDiscovery"/>, <see cref="ISubscribable"/>, <see cref="IHostConnectivityProbe"/>,
|
||||
/// <see cref="IAlarmSource"/>, <see cref="IHistoryProvider"/>).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Per <c>docs/v2/plan.md</c> decision #143 (per-capability retry policy): Read / HistoryRead /
|
||||
/// Discover / Probe / AlarmSubscribe auto-retry; <see cref="Write"/> does NOT retry unless the
|
||||
/// tag-definition carries <see cref="WriteIdempotentAttribute"/>. Alarm-acknowledge is treated
|
||||
/// as a write for retry semantics (an alarm-ack is not idempotent at the plant-floor acknowledgement
|
||||
/// level even if the OPC UA spec permits re-issue).
|
||||
/// </remarks>
|
||||
public enum DriverCapability
|
||||
{
|
||||
/// <summary>Batch <see cref="IReadable.ReadAsync"/>. Retries by default.</summary>
|
||||
Read,
|
||||
|
||||
/// <summary>Batch <see cref="IWritable.WriteAsync"/>. Does not retry unless tag is <see cref="WriteIdempotentAttribute">idempotent</see>.</summary>
|
||||
Write,
|
||||
|
||||
/// <summary><see cref="ITagDiscovery.DiscoverAsync"/>. Retries by default.</summary>
|
||||
Discover,
|
||||
|
||||
/// <summary><see cref="ISubscribable.SubscribeAsync"/> and unsubscribe. Retries by default.</summary>
|
||||
Subscribe,
|
||||
|
||||
/// <summary><see cref="IHostConnectivityProbe"/> probe loop. Retries by default.</summary>
|
||||
Probe,
|
||||
|
||||
/// <summary><see cref="IAlarmSource.SubscribeAlarmsAsync"/>. Retries by default.</summary>
|
||||
AlarmSubscribe,
|
||||
|
||||
/// <summary><see cref="IAlarmSource.AcknowledgeAsync"/>. Does NOT retry — ack is a write-shaped operation (decision #143).</summary>
|
||||
AlarmAcknowledge,
|
||||
|
||||
/// <summary><see cref="IHistoryProvider"/> reads (Raw/Processed/AtTime/Events). Retries by default.</summary>
|
||||
HistoryRead,
|
||||
}
|
||||
34
src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/DriverTier.cs
Normal file
34
src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/DriverTier.cs
Normal file
@@ -0,0 +1,34 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
/// <summary>
|
||||
/// Stability tier of a driver type. Determines which cross-cutting runtime protections
|
||||
/// apply — per-tier retry defaults, memory-tracking thresholds, and whether out-of-process
|
||||
/// supervision with process-level recycle is in play.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Per <c>docs/v2/driver-stability.md</c> §2-4 and <c>docs/v2/plan.md</c> decisions #63-74.
|
||||
///
|
||||
/// <list type="bullet">
|
||||
/// <item><b>A</b> — managed, known-good SDK; low blast radius. In-process. Fast retries.
|
||||
/// Examples: OPC UA Client (OPCFoundation stack), S7 (S7NetPlus).</item>
|
||||
/// <item><b>B</b> — native or semi-trusted SDK with an in-process footprint. Examples: Modbus.</item>
|
||||
/// <item><b>C</b> — unmanaged SDK with COM/STA constraints, leak risk, or other out-of-process
|
||||
/// requirements. Must run as a separate Host process behind a Proxy with a supervisor that
|
||||
/// can recycle the process on hard-breach. Example: Galaxy (MXAccess COM).</item>
|
||||
/// </list>
|
||||
///
|
||||
/// <para>Process-kill protections (<c>MemoryRecycle</c>, <c>ScheduledRecycleScheduler</c>) are
|
||||
/// Tier C only per decisions #73-74 and #145 — killing an in-process Tier A/B driver also kills
|
||||
/// every OPC UA session and every co-hosted driver, blast-radius worse than the leak.</para>
|
||||
/// </remarks>
|
||||
public enum DriverTier
|
||||
{
|
||||
/// <summary>Managed SDK, in-process, low blast radius.</summary>
|
||||
A,
|
||||
|
||||
/// <summary>Native or semi-trusted SDK, in-process.</summary>
|
||||
B,
|
||||
|
||||
/// <summary>Unmanaged SDK, out-of-process required with Proxy+Host+Supervisor.</summary>
|
||||
C,
|
||||
}
|
||||
@@ -69,12 +69,20 @@ public sealed class DriverTypeRegistry
|
||||
/// <param name="DriverConfigJsonSchema">JSON Schema (Draft 2020-12) the driver's <c>DriverConfig</c> column must validate against.</param>
|
||||
/// <param name="DeviceConfigJsonSchema">JSON Schema for <c>DeviceConfig</c> (multi-device drivers); null if the driver has no device layer.</param>
|
||||
/// <param name="TagConfigJsonSchema">JSON Schema for <c>TagConfig</c>; required for every driver since every driver has tags.</param>
|
||||
/// <param name="Tier">
|
||||
/// Stability tier per <c>docs/v2/driver-stability.md</c> §2-4 and <c>docs/v2/plan.md</c>
|
||||
/// decisions #63-74. Drives the shared resilience pipeline defaults
|
||||
/// (<see cref="Tier"/> × capability → <c>CapabilityPolicy</c>), the <c>MemoryTracking</c>
|
||||
/// hybrid-formula constants, and whether process-level <c>MemoryRecycle</c> / scheduled-
|
||||
/// recycle protections apply (Tier C only). Every registered driver type must declare one.
|
||||
/// </param>
|
||||
public sealed record DriverTypeMetadata(
|
||||
string TypeName,
|
||||
NamespaceKindCompatibility AllowedNamespaceKinds,
|
||||
string DriverConfigJsonSchema,
|
||||
string? DeviceConfigJsonSchema,
|
||||
string TagConfigJsonSchema);
|
||||
string TagConfigJsonSchema,
|
||||
DriverTier Tier);
|
||||
|
||||
/// <summary>Bitmask of namespace kinds a driver type may populate. Per decision #111.</summary>
|
||||
[Flags]
|
||||
|
||||
26
src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/IDriverSupervisor.cs
Normal file
26
src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/IDriverSupervisor.cs
Normal file
@@ -0,0 +1,26 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
/// <summary>
|
||||
/// Process-level supervisor contract a Tier C driver's out-of-process topology provides
|
||||
/// (e.g. <c>Driver.Galaxy.Proxy/Supervisor/</c>). Concerns: restart the Host process when a
|
||||
/// hard fault is detected (memory breach, wedge, scheduled recycle window).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Per <c>docs/v2/plan.md</c> decisions #68, #73-74, and #145. Tier A/B drivers do NOT have
|
||||
/// a supervisor because they run in-process — recycling would kill every OPC UA session and
|
||||
/// every co-hosted driver. The Core.Stability layer only invokes this interface for Tier C
|
||||
/// instances after asserting the tier via <see cref="DriverTypeMetadata.Tier"/>.
|
||||
/// </remarks>
|
||||
public interface IDriverSupervisor
|
||||
{
|
||||
/// <summary>Driver instance this supervisor governs.</summary>
|
||||
string DriverInstanceId { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Request the supervisor to recycle (terminate + restart) the Host process. Implementations
|
||||
/// are expected to be idempotent under repeat calls during an in-flight recycle.
|
||||
/// </summary>
|
||||
/// <param name="reason">Human-readable reason — flows into the supervisor's logs.</param>
|
||||
/// <param name="cancellationToken">Cancels the recycle request; an in-flight restart is not interrupted.</param>
|
||||
Task RecycleAsync(string reason, CancellationToken cancellationToken);
|
||||
}
|
||||
@@ -30,6 +30,52 @@ public interface IHistoryProvider
|
||||
TimeSpan interval,
|
||||
HistoryAggregateType aggregate,
|
||||
CancellationToken cancellationToken);
|
||||
|
||||
/// <summary>
|
||||
/// Read one sample per requested timestamp — OPC UA HistoryReadAtTime service. The
|
||||
/// driver interpolates (or returns the prior-boundary sample) when no exact match
|
||||
/// exists. Optional; drivers that can't interpolate throw <see cref="NotSupportedException"/>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Default implementation throws. Drivers opt in by overriding; keeps existing
|
||||
/// <c>IHistoryProvider</c> implementations compiling without forcing a ReadAtTime path
|
||||
/// they may not have a backend for.
|
||||
/// </remarks>
|
||||
Task<HistoryReadResult> ReadAtTimeAsync(
|
||||
string fullReference,
|
||||
IReadOnlyList<DateTime> timestampsUtc,
|
||||
CancellationToken cancellationToken)
|
||||
=> throw new NotSupportedException(
|
||||
$"{GetType().Name} does not implement ReadAtTimeAsync. " +
|
||||
"Drivers whose backends support at-time reads override this method.");
|
||||
|
||||
/// <summary>
|
||||
/// Read historical alarm/event records — OPC UA HistoryReadEvents service. Distinct
|
||||
/// from the live event stream — historical rows come from an event historian (Galaxy's
|
||||
/// Alarm Provider history log, etc.) rather than the driver's active subscription.
|
||||
/// </summary>
|
||||
/// <param name="sourceName">
|
||||
/// Optional filter: null means "all sources", otherwise restrict to events from that
|
||||
/// source-object name. Drivers may ignore the filter if the backend doesn't support it.
|
||||
/// </param>
|
||||
/// <param name="startUtc">Inclusive lower bound on <c>EventTimeUtc</c>.</param>
|
||||
/// <param name="endUtc">Exclusive upper bound on <c>EventTimeUtc</c>.</param>
|
||||
/// <param name="maxEvents">Upper cap on returned events — the driver's backend enforces this.</param>
|
||||
/// <param name="cancellationToken">Request cancellation.</param>
|
||||
/// <remarks>
|
||||
/// Default implementation throws. Only drivers with an event historian (Galaxy via the
|
||||
/// Wonderware Alarm & Events log) override. Modbus / the OPC UA Client driver stay
|
||||
/// with the default and let callers see <c>BadHistoryOperationUnsupported</c>.
|
||||
/// </remarks>
|
||||
Task<HistoricalEventsResult> ReadEventsAsync(
|
||||
string? sourceName,
|
||||
DateTime startUtc,
|
||||
DateTime endUtc,
|
||||
int maxEvents,
|
||||
CancellationToken cancellationToken)
|
||||
=> throw new NotSupportedException(
|
||||
$"{GetType().Name} does not implement ReadEventsAsync. " +
|
||||
"Drivers whose backends have an event historian override this method.");
|
||||
}
|
||||
|
||||
/// <summary>Result of a HistoryRead call.</summary>
|
||||
@@ -48,3 +94,29 @@ public enum HistoryAggregateType
|
||||
Total,
|
||||
Count,
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// One row returned by <see cref="IHistoryProvider.ReadEventsAsync"/> — a historical
|
||||
/// alarm/event record, not the OPC UA live-event stream. Fields match the minimum set the
|
||||
/// Server needs to populate a <c>HistoryEventFieldList</c> for HistoryReadEvents responses.
|
||||
/// </summary>
|
||||
/// <param name="EventId">Stable unique id for the event — driver-specific format.</param>
|
||||
/// <param name="SourceName">Source object that emitted the event. May differ from the <c>sourceName</c> filter the caller passed (fuzzy matches).</param>
|
||||
/// <param name="EventTimeUtc">Process-side timestamp — when the event actually occurred.</param>
|
||||
/// <param name="ReceivedTimeUtc">Historian-side timestamp — when the historian persisted the row; may lag <paramref name="EventTimeUtc"/> by the historian's buffer flush cadence.</param>
|
||||
/// <param name="Message">Human-readable message text.</param>
|
||||
/// <param name="Severity">OPC UA severity (1-1000). Drivers map their native priority scale onto this range.</param>
|
||||
public sealed record HistoricalEvent(
|
||||
string EventId,
|
||||
string? SourceName,
|
||||
DateTime EventTimeUtc,
|
||||
DateTime ReceivedTimeUtc,
|
||||
string? Message,
|
||||
ushort Severity);
|
||||
|
||||
/// <summary>Result of a <see cref="IHistoryProvider.ReadEventsAsync"/> call.</summary>
|
||||
/// <param name="Events">Events in chronological order by <c>EventTimeUtc</c>.</param>
|
||||
/// <param name="ContinuationPoint">Opaque token for the next call when more events are available; null when complete.</param>
|
||||
public sealed record HistoricalEventsResult(
|
||||
IReadOnlyList<HistoricalEvent> Events,
|
||||
byte[]? ContinuationPoint);
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
/// <summary>
|
||||
/// Opts a tag-definition record into auto-retry on <see cref="IWritable.WriteAsync"/> failures.
|
||||
/// Absence of this attribute means writes are <b>not</b> retried — a timed-out write may have
|
||||
/// already succeeded at the device, and replaying pulses, alarm acks, counter increments, or
|
||||
/// recipe-step advances can duplicate irreversible field actions.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Per <c>docs/v2/plan.md</c> decisions #44, #45, and #143. Applied to tag-definition POCOs
|
||||
/// (e.g. <c>ModbusTagDefinition</c>, <c>S7TagDefinition</c>, OPC UA client tag rows) at the
|
||||
/// property or record level. The <c>CapabilityInvoker</c> in <c>ZB.MOM.WW.OtOpcUa.Core.Resilience</c>
|
||||
/// reads this attribute via reflection once at driver-init time and caches the result; no
|
||||
/// per-write reflection cost.
|
||||
/// </remarks>
|
||||
[AttributeUsage(AttributeTargets.Property | AttributeTargets.Class | AttributeTargets.Struct, AllowMultiple = false, Inherited = true)]
|
||||
public sealed class WriteIdempotentAttribute : Attribute
|
||||
{
|
||||
}
|
||||
106
src/ZB.MOM.WW.OtOpcUa.Core/Resilience/CapabilityInvoker.cs
Normal file
106
src/ZB.MOM.WW.OtOpcUa.Core/Resilience/CapabilityInvoker.cs
Normal file
@@ -0,0 +1,106 @@
|
||||
using Polly;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Executes driver-capability calls through a shared Polly pipeline. One invoker per
|
||||
/// <c>(DriverInstance, IDriver)</c> pair; the underlying <see cref="DriverResiliencePipelineBuilder"/>
|
||||
/// is process-singleton so all invokers share its cache.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Per <c>docs/v2/plan.md</c> decisions #143-144 and Phase 6.1 Stream A.3. The server's dispatch
|
||||
/// layer routes every capability call (<c>IReadable.ReadAsync</c>, <c>IWritable.WriteAsync</c>,
|
||||
/// <c>ITagDiscovery.DiscoverAsync</c>, <c>ISubscribable.SubscribeAsync/UnsubscribeAsync</c>,
|
||||
/// <c>IHostConnectivityProbe</c> probe loop, <c>IAlarmSource.SubscribeAlarmsAsync/AcknowledgeAsync</c>,
|
||||
/// and all four <c>IHistoryProvider</c> reads) through this invoker.
|
||||
/// </remarks>
|
||||
public sealed class CapabilityInvoker
|
||||
{
|
||||
private readonly DriverResiliencePipelineBuilder _builder;
|
||||
private readonly string _driverInstanceId;
|
||||
private readonly Func<DriverResilienceOptions> _optionsAccessor;
|
||||
|
||||
/// <summary>
|
||||
/// Construct an invoker for one driver instance.
|
||||
/// </summary>
|
||||
/// <param name="builder">Shared, process-singleton pipeline builder.</param>
|
||||
/// <param name="driverInstanceId">The <c>DriverInstance.Id</c> column value.</param>
|
||||
/// <param name="optionsAccessor">
|
||||
/// Snapshot accessor for the current resilience options. Invoked per call so Admin-edit +
|
||||
/// pipeline-invalidate can take effect without restarting the invoker.
|
||||
/// </param>
|
||||
public CapabilityInvoker(
|
||||
DriverResiliencePipelineBuilder builder,
|
||||
string driverInstanceId,
|
||||
Func<DriverResilienceOptions> optionsAccessor)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(builder);
|
||||
ArgumentNullException.ThrowIfNull(optionsAccessor);
|
||||
|
||||
_builder = builder;
|
||||
_driverInstanceId = driverInstanceId;
|
||||
_optionsAccessor = optionsAccessor;
|
||||
}
|
||||
|
||||
/// <summary>Execute a capability call returning a value, honoring the per-capability pipeline.</summary>
|
||||
/// <typeparam name="TResult">Return type of the underlying driver call.</typeparam>
|
||||
public async ValueTask<TResult> ExecuteAsync<TResult>(
|
||||
DriverCapability capability,
|
||||
string hostName,
|
||||
Func<CancellationToken, ValueTask<TResult>> callSite,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(callSite);
|
||||
|
||||
var pipeline = ResolvePipeline(capability, hostName);
|
||||
return await pipeline.ExecuteAsync(callSite, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
/// <summary>Execute a void-returning capability call, honoring the per-capability pipeline.</summary>
|
||||
public async ValueTask ExecuteAsync(
|
||||
DriverCapability capability,
|
||||
string hostName,
|
||||
Func<CancellationToken, ValueTask> callSite,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(callSite);
|
||||
|
||||
var pipeline = ResolvePipeline(capability, hostName);
|
||||
await pipeline.ExecuteAsync(callSite, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Execute a <see cref="DriverCapability.Write"/> call honoring <see cref="WriteIdempotentAttribute"/>
|
||||
/// semantics — if <paramref name="isIdempotent"/> is <c>false</c>, retries are disabled regardless
|
||||
/// of the tag-level configuration (the pipeline for a non-idempotent write never retries per
|
||||
/// decisions #44-45). If <c>true</c>, the call runs through the capability's pipeline which may
|
||||
/// retry when the tier configuration permits.
|
||||
/// </summary>
|
||||
public async ValueTask<TResult> ExecuteWriteAsync<TResult>(
|
||||
string hostName,
|
||||
bool isIdempotent,
|
||||
Func<CancellationToken, ValueTask<TResult>> callSite,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(callSite);
|
||||
|
||||
if (!isIdempotent)
|
||||
{
|
||||
var noRetryOptions = _optionsAccessor() with
|
||||
{
|
||||
CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Write] = _optionsAccessor().Resolve(DriverCapability.Write) with { RetryCount = 0 },
|
||||
},
|
||||
};
|
||||
var pipeline = _builder.GetOrCreate(_driverInstanceId, $"{hostName}::non-idempotent", DriverCapability.Write, noRetryOptions);
|
||||
return await pipeline.ExecuteAsync(callSite, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
return await ExecuteAsync(DriverCapability.Write, hostName, callSite, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private ResiliencePipeline ResolvePipeline(DriverCapability capability, string hostName) =>
|
||||
_builder.GetOrCreate(_driverInstanceId, hostName, capability, _optionsAccessor());
|
||||
}
|
||||
@@ -0,0 +1,96 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Per-tier × per-capability resilience policy configuration for a driver instance.
|
||||
/// Bound from <c>DriverInstance.ResilienceConfig</c> JSON (nullable column; null = tier defaults).
|
||||
/// Per <c>docs/v2/plan.md</c> decisions #143 and #144.
|
||||
/// </summary>
|
||||
public sealed record DriverResilienceOptions
|
||||
{
|
||||
/// <summary>Tier the owning driver type is registered as; drives the default map.</summary>
|
||||
public required DriverTier Tier { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Per-capability policy overrides. Capabilities absent from this map fall back to
|
||||
/// <see cref="GetTierDefaults(DriverTier)"/> for the configured <see cref="Tier"/>.
|
||||
/// </summary>
|
||||
public IReadOnlyDictionary<DriverCapability, CapabilityPolicy> CapabilityPolicies { get; init; }
|
||||
= new Dictionary<DriverCapability, CapabilityPolicy>();
|
||||
|
||||
/// <summary>Bulkhead (max concurrent in-flight calls) for every capability. Default 32.</summary>
|
||||
public int BulkheadMaxConcurrent { get; init; } = 32;
|
||||
|
||||
/// <summary>
|
||||
/// Bulkhead queue depth. Zero = no queueing; overflow fails fast with
|
||||
/// <c>BulkheadRejectedException</c>. Default 64.
|
||||
/// </summary>
|
||||
public int BulkheadMaxQueue { get; init; } = 64;
|
||||
|
||||
/// <summary>
|
||||
/// Look up the effective policy for a capability, falling back to tier defaults when no
|
||||
/// override is configured. Never returns null.
|
||||
/// </summary>
|
||||
public CapabilityPolicy Resolve(DriverCapability capability)
|
||||
{
|
||||
if (CapabilityPolicies.TryGetValue(capability, out var policy))
|
||||
return policy;
|
||||
|
||||
var defaults = GetTierDefaults(Tier);
|
||||
return defaults[capability];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Per-tier per-capability default policy table, per decisions #143-144 and the Phase 6.1
|
||||
/// Stream A.2 specification. Retries skipped on <see cref="DriverCapability.Write"/> and
|
||||
/// <see cref="DriverCapability.AlarmAcknowledge"/> regardless of tier.
|
||||
/// </summary>
|
||||
public static IReadOnlyDictionary<DriverCapability, CapabilityPolicy> GetTierDefaults(DriverTier tier) =>
|
||||
tier switch
|
||||
{
|
||||
DriverTier.A => new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Read] = new(TimeoutSeconds: 2, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.Write] = new(TimeoutSeconds: 2, RetryCount: 0, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.Discover] = new(TimeoutSeconds: 30, RetryCount: 2, BreakerFailureThreshold: 3),
|
||||
[DriverCapability.Subscribe] = new(TimeoutSeconds: 5, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.Probe] = new(TimeoutSeconds: 2, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.AlarmSubscribe] = new(TimeoutSeconds: 5, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.AlarmAcknowledge] = new(TimeoutSeconds: 5, RetryCount: 0, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.HistoryRead] = new(TimeoutSeconds: 30, RetryCount: 2, BreakerFailureThreshold: 5),
|
||||
},
|
||||
DriverTier.B => new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Read] = new(TimeoutSeconds: 4, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.Write] = new(TimeoutSeconds: 4, RetryCount: 0, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.Discover] = new(TimeoutSeconds: 60, RetryCount: 2, BreakerFailureThreshold: 3),
|
||||
[DriverCapability.Subscribe] = new(TimeoutSeconds: 8, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.Probe] = new(TimeoutSeconds: 4, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.AlarmSubscribe] = new(TimeoutSeconds: 8, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.AlarmAcknowledge] = new(TimeoutSeconds: 8, RetryCount: 0, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.HistoryRead] = new(TimeoutSeconds: 60, RetryCount: 2, BreakerFailureThreshold: 5),
|
||||
},
|
||||
DriverTier.C => new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Read] = new(TimeoutSeconds: 10, RetryCount: 1, BreakerFailureThreshold: 0),
|
||||
[DriverCapability.Write] = new(TimeoutSeconds: 10, RetryCount: 0, BreakerFailureThreshold: 0),
|
||||
[DriverCapability.Discover] = new(TimeoutSeconds: 120, RetryCount: 1, BreakerFailureThreshold: 0),
|
||||
[DriverCapability.Subscribe] = new(TimeoutSeconds: 15, RetryCount: 1, BreakerFailureThreshold: 0),
|
||||
[DriverCapability.Probe] = new(TimeoutSeconds: 10, RetryCount: 1, BreakerFailureThreshold: 0),
|
||||
[DriverCapability.AlarmSubscribe] = new(TimeoutSeconds: 15, RetryCount: 1, BreakerFailureThreshold: 0),
|
||||
[DriverCapability.AlarmAcknowledge] = new(TimeoutSeconds: 15, RetryCount: 0, BreakerFailureThreshold: 0),
|
||||
[DriverCapability.HistoryRead] = new(TimeoutSeconds: 120, RetryCount: 1, BreakerFailureThreshold: 0),
|
||||
},
|
||||
_ => throw new ArgumentOutOfRangeException(nameof(tier), tier, $"No default policy table defined for tier {tier}."),
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>Policy for one capability on one driver instance.</summary>
|
||||
/// <param name="TimeoutSeconds">Per-call timeout (wraps the inner Polly execution).</param>
|
||||
/// <param name="RetryCount">Number of retry attempts after the first failure; zero = no retry.</param>
|
||||
/// <param name="BreakerFailureThreshold">
|
||||
/// Consecutive-failure count that opens the circuit breaker; zero = no breaker
|
||||
/// (Tier C uses the supervisor's process-level breaker instead, per decision #68).
|
||||
/// </param>
|
||||
public sealed record CapabilityPolicy(int TimeoutSeconds, int RetryCount, int BreakerFailureThreshold);
|
||||
@@ -0,0 +1,118 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Polly;
|
||||
using Polly.CircuitBreaker;
|
||||
using Polly.Retry;
|
||||
using Polly.Timeout;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Builds and caches Polly resilience pipelines keyed on
|
||||
/// <c>(DriverInstanceId, HostName, DriverCapability)</c>. One dead PLC behind a multi-device
|
||||
/// driver cannot open the circuit breaker for healthy sibling hosts.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Per <c>docs/v2/plan.md</c> decision #144 (per-device isolation). Composition from outside-in:
|
||||
/// <b>Timeout → Retry (when capability permits) → Circuit Breaker (when tier permits) → Bulkhead</b>.
|
||||
///
|
||||
/// <para>Pipeline resolution is lock-free on the hot path: the inner
|
||||
/// <see cref="ConcurrentDictionary{TKey,TValue}"/> caches a <see cref="ResiliencePipeline"/> per key;
|
||||
/// first-call cost is one <see cref="ResiliencePipelineBuilder"/>.Build. Thereafter reads are O(1).</para>
|
||||
/// </remarks>
|
||||
public sealed class DriverResiliencePipelineBuilder
|
||||
{
|
||||
private readonly ConcurrentDictionary<PipelineKey, ResiliencePipeline> _pipelines = new();
|
||||
private readonly TimeProvider _timeProvider;
|
||||
|
||||
/// <summary>Construct with the ambient clock (use <see cref="TimeProvider.System"/> in prod).</summary>
|
||||
public DriverResiliencePipelineBuilder(TimeProvider? timeProvider = null)
|
||||
{
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get or build the pipeline for a given <c>(driver instance, host, capability)</c> triple.
|
||||
/// Calls with the same key + same options reuse the same pipeline instance; the first caller
|
||||
/// wins if a race occurs (both pipelines would be behaviourally identical).
|
||||
/// </summary>
|
||||
/// <param name="driverInstanceId">DriverInstance primary key — opaque to this layer.</param>
|
||||
/// <param name="hostName">
|
||||
/// Host the call targets. For single-host drivers (Galaxy, some OPC UA Client configs) pass the
|
||||
/// driver's canonical host string. For multi-host drivers (Modbus with N PLCs), pass the
|
||||
/// specific PLC so one dead PLC doesn't poison healthy siblings.
|
||||
/// </param>
|
||||
/// <param name="capability">Which capability surface is being called.</param>
|
||||
/// <param name="options">Per-driver-instance options (tier + per-capability overrides).</param>
|
||||
public ResiliencePipeline GetOrCreate(
|
||||
string driverInstanceId,
|
||||
string hostName,
|
||||
DriverCapability capability,
|
||||
DriverResilienceOptions options)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(options);
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(hostName);
|
||||
|
||||
var key = new PipelineKey(driverInstanceId, hostName, capability);
|
||||
return _pipelines.GetOrAdd(key, static (_, state) => Build(state.capability, state.options, state.timeProvider),
|
||||
(capability, options, timeProvider: _timeProvider));
|
||||
}
|
||||
|
||||
/// <summary>Drop cached pipelines for one driver instance (e.g. on ResilienceConfig change). Test + Admin-reload use.</summary>
|
||||
public int Invalidate(string driverInstanceId)
|
||||
{
|
||||
var removed = 0;
|
||||
foreach (var key in _pipelines.Keys)
|
||||
{
|
||||
if (key.DriverInstanceId == driverInstanceId && _pipelines.TryRemove(key, out _))
|
||||
removed++;
|
||||
}
|
||||
return removed;
|
||||
}
|
||||
|
||||
/// <summary>Snapshot of the current number of cached pipelines. For diagnostics only.</summary>
|
||||
public int CachedPipelineCount => _pipelines.Count;
|
||||
|
||||
private static ResiliencePipeline Build(
|
||||
DriverCapability capability,
|
||||
DriverResilienceOptions options,
|
||||
TimeProvider timeProvider)
|
||||
{
|
||||
var policy = options.Resolve(capability);
|
||||
var builder = new ResiliencePipelineBuilder { TimeProvider = timeProvider };
|
||||
|
||||
builder.AddTimeout(new TimeoutStrategyOptions
|
||||
{
|
||||
Timeout = TimeSpan.FromSeconds(policy.TimeoutSeconds),
|
||||
});
|
||||
|
||||
if (policy.RetryCount > 0)
|
||||
{
|
||||
builder.AddRetry(new RetryStrategyOptions
|
||||
{
|
||||
MaxRetryAttempts = policy.RetryCount,
|
||||
BackoffType = DelayBackoffType.Exponential,
|
||||
UseJitter = true,
|
||||
Delay = TimeSpan.FromMilliseconds(100),
|
||||
MaxDelay = TimeSpan.FromSeconds(5),
|
||||
ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
|
||||
});
|
||||
}
|
||||
|
||||
if (policy.BreakerFailureThreshold > 0)
|
||||
{
|
||||
builder.AddCircuitBreaker(new CircuitBreakerStrategyOptions
|
||||
{
|
||||
FailureRatio = 1.0,
|
||||
MinimumThroughput = policy.BreakerFailureThreshold,
|
||||
SamplingDuration = TimeSpan.FromSeconds(30),
|
||||
BreakDuration = TimeSpan.FromSeconds(15),
|
||||
ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
|
||||
});
|
||||
}
|
||||
|
||||
return builder.Build();
|
||||
}
|
||||
|
||||
private readonly record struct PipelineKey(string DriverInstanceId, string HostName, DriverCapability Capability);
|
||||
}
|
||||
65
src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryRecycle.cs
Normal file
65
src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryRecycle.cs
Normal file
@@ -0,0 +1,65 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
/// <summary>
|
||||
/// Tier C only process-recycle companion to <see cref="MemoryTracking"/>. On a
|
||||
/// <see cref="MemoryTrackingAction.HardBreach"/> signal, invokes the supplied
|
||||
/// <see cref="IDriverSupervisor"/> to restart the out-of-process Host.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Per <c>docs/v2/plan.md</c> decisions #74 and #145. Tier A/B hard-breach on an in-process
|
||||
/// driver would kill every OPC UA session and every co-hosted driver, so for Tier A/B this
|
||||
/// class logs a <b>promotion-to-Tier-C recommendation</b> and does NOT invoke any supervisor.
|
||||
/// A future tier-migration workflow acts on the recommendation.
|
||||
/// </remarks>
|
||||
public sealed class MemoryRecycle
|
||||
{
|
||||
private readonly DriverTier _tier;
|
||||
private readonly IDriverSupervisor? _supervisor;
|
||||
private readonly ILogger<MemoryRecycle> _logger;
|
||||
|
||||
public MemoryRecycle(DriverTier tier, IDriverSupervisor? supervisor, ILogger<MemoryRecycle> logger)
|
||||
{
|
||||
_tier = tier;
|
||||
_supervisor = supervisor;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Handle a <see cref="MemoryTracking"/> classification for the driver. For Tier C with a
|
||||
/// wired supervisor, <c>HardBreach</c> triggers <see cref="IDriverSupervisor.RecycleAsync"/>.
|
||||
/// All other combinations are no-ops with respect to process state (soft breaches + Tier A/B
|
||||
/// hard breaches just log).
|
||||
/// </summary>
|
||||
/// <returns>True when a recycle was requested; false otherwise.</returns>
|
||||
public async Task<bool> HandleAsync(MemoryTrackingAction action, long footprintBytes, CancellationToken cancellationToken)
|
||||
{
|
||||
switch (action)
|
||||
{
|
||||
case MemoryTrackingAction.SoftBreach:
|
||||
_logger.LogWarning(
|
||||
"Memory soft-breach on driver {DriverId}: footprint={Footprint:N0} bytes, tier={Tier}. Surfaced to Admin; no action.",
|
||||
_supervisor?.DriverInstanceId ?? "(unknown)", footprintBytes, _tier);
|
||||
return false;
|
||||
|
||||
case MemoryTrackingAction.HardBreach when _tier == DriverTier.C && _supervisor is not null:
|
||||
_logger.LogError(
|
||||
"Memory hard-breach on Tier C driver {DriverId}: footprint={Footprint:N0} bytes. Requesting supervisor recycle.",
|
||||
_supervisor.DriverInstanceId, footprintBytes);
|
||||
await _supervisor.RecycleAsync($"Memory hard-breach: {footprintBytes} bytes", cancellationToken).ConfigureAwait(false);
|
||||
return true;
|
||||
|
||||
case MemoryTrackingAction.HardBreach:
|
||||
_logger.LogError(
|
||||
"Memory hard-breach on Tier {Tier} in-process driver {DriverId}: footprint={Footprint:N0} bytes. " +
|
||||
"Recommending promotion to Tier C; NOT auto-killing (decisions #74, #145).",
|
||||
_tier, _supervisor?.DriverInstanceId ?? "(unknown)", footprintBytes);
|
||||
return false;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
136
src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryTracking.cs
Normal file
136
src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryTracking.cs
Normal file
@@ -0,0 +1,136 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
/// <summary>
|
||||
/// Tier-agnostic memory-footprint tracker. Captures the post-initialize <b>baseline</b>
|
||||
/// from the first samples after <c>IDriver.InitializeAsync</c>, then classifies each
|
||||
/// subsequent sample against a hybrid soft/hard threshold per
|
||||
/// <c>docs/v2/plan.md</c> decision #146 — <c>soft = max(multiplier × baseline, baseline + floor)</c>,
|
||||
/// <c>hard = 2 × soft</c>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>Per decision #145, this tracker <b>never kills a process</b>. Soft and hard breaches
|
||||
/// log + surface to the Admin UI via <c>DriverInstanceResilienceStatus</c>. The matching
|
||||
/// process-level recycle protection lives in a separate <c>MemoryRecycle</c> that activates
|
||||
/// for Tier C drivers only (where the driver runs out-of-process behind a supervisor that
|
||||
/// can safely restart it without tearing down the OPC UA session or co-hosted in-proc
|
||||
/// drivers).</para>
|
||||
///
|
||||
/// <para>Baseline capture: the tracker starts in <see cref="TrackingPhase.WarmingUp"/> for
|
||||
/// <see cref="BaselineWindow"/> (default 5 min). During that window samples are collected;
|
||||
/// the baseline is computed as the median once the window elapses. Before that point every
|
||||
/// classification returns <see cref="MemoryTrackingAction.Warming"/>.</para>
|
||||
/// </remarks>
|
||||
public sealed class MemoryTracking
|
||||
{
|
||||
private readonly DriverTier _tier;
|
||||
private readonly TimeSpan _baselineWindow;
|
||||
private readonly List<long> _warmupSamples = [];
|
||||
private long _baselineBytes;
|
||||
private TrackingPhase _phase = TrackingPhase.WarmingUp;
|
||||
private DateTime? _warmupStartUtc;
|
||||
|
||||
/// <summary>Tier-default multiplier/floor constants per decision #146.</summary>
|
||||
public static (int Multiplier, long FloorBytes) GetTierConstants(DriverTier tier) => tier switch
|
||||
{
|
||||
DriverTier.A => (Multiplier: 3, FloorBytes: 50L * 1024 * 1024),
|
||||
DriverTier.B => (Multiplier: 3, FloorBytes: 100L * 1024 * 1024),
|
||||
DriverTier.C => (Multiplier: 2, FloorBytes: 500L * 1024 * 1024),
|
||||
_ => throw new ArgumentOutOfRangeException(nameof(tier), tier, $"No memory-tracking constants defined for tier {tier}."),
|
||||
};
|
||||
|
||||
/// <summary>Window over which post-init samples are collected to compute the baseline.</summary>
|
||||
public TimeSpan BaselineWindow => _baselineWindow;
|
||||
|
||||
/// <summary>Current phase: <see cref="TrackingPhase.WarmingUp"/> or <see cref="TrackingPhase.Steady"/>.</summary>
|
||||
public TrackingPhase Phase => _phase;
|
||||
|
||||
/// <summary>Captured baseline; 0 until warmup completes.</summary>
|
||||
public long BaselineBytes => _baselineBytes;
|
||||
|
||||
/// <summary>Effective soft threshold (zero while warming up).</summary>
|
||||
public long SoftThresholdBytes => _baselineBytes == 0 ? 0 : ComputeSoft(_tier, _baselineBytes);
|
||||
|
||||
/// <summary>Effective hard threshold = 2 × soft (zero while warming up).</summary>
|
||||
public long HardThresholdBytes => _baselineBytes == 0 ? 0 : ComputeSoft(_tier, _baselineBytes) * 2;
|
||||
|
||||
public MemoryTracking(DriverTier tier, TimeSpan? baselineWindow = null)
|
||||
{
|
||||
_tier = tier;
|
||||
_baselineWindow = baselineWindow ?? TimeSpan.FromMinutes(5);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Submit a memory-footprint sample. Returns the action the caller should surface.
|
||||
/// During warmup, always returns <see cref="MemoryTrackingAction.Warming"/> and accumulates
|
||||
/// samples; once the window elapses the first steady-phase sample triggers baseline capture
|
||||
/// (median of warmup samples).
|
||||
/// </summary>
|
||||
public MemoryTrackingAction Sample(long footprintBytes, DateTime utcNow)
|
||||
{
|
||||
if (_phase == TrackingPhase.WarmingUp)
|
||||
{
|
||||
_warmupStartUtc ??= utcNow;
|
||||
_warmupSamples.Add(footprintBytes);
|
||||
if (utcNow - _warmupStartUtc.Value >= _baselineWindow && _warmupSamples.Count > 0)
|
||||
{
|
||||
_baselineBytes = ComputeMedian(_warmupSamples);
|
||||
_phase = TrackingPhase.Steady;
|
||||
}
|
||||
else
|
||||
{
|
||||
return MemoryTrackingAction.Warming;
|
||||
}
|
||||
}
|
||||
|
||||
if (footprintBytes >= HardThresholdBytes) return MemoryTrackingAction.HardBreach;
|
||||
if (footprintBytes >= SoftThresholdBytes) return MemoryTrackingAction.SoftBreach;
|
||||
return MemoryTrackingAction.None;
|
||||
}
|
||||
|
||||
private static long ComputeSoft(DriverTier tier, long baseline)
|
||||
{
|
||||
var (multiplier, floor) = GetTierConstants(tier);
|
||||
return Math.Max(multiplier * baseline, baseline + floor);
|
||||
}
|
||||
|
||||
private static long ComputeMedian(List<long> samples)
|
||||
{
|
||||
var sorted = samples.Order().ToArray();
|
||||
var mid = sorted.Length / 2;
|
||||
return sorted.Length % 2 == 1
|
||||
? sorted[mid]
|
||||
: (sorted[mid - 1] + sorted[mid]) / 2;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>Phase of a <see cref="MemoryTracking"/> lifecycle.</summary>
|
||||
public enum TrackingPhase
|
||||
{
|
||||
/// <summary>Collecting post-init samples; baseline not yet computed.</summary>
|
||||
WarmingUp,
|
||||
|
||||
/// <summary>Baseline captured; every sample classified against soft/hard thresholds.</summary>
|
||||
Steady,
|
||||
}
|
||||
|
||||
/// <summary>Classification the tracker returns per sample.</summary>
|
||||
public enum MemoryTrackingAction
|
||||
{
|
||||
/// <summary>Baseline not yet captured; sample collected, no threshold check.</summary>
|
||||
Warming,
|
||||
|
||||
/// <summary>Below soft threshold.</summary>
|
||||
None,
|
||||
|
||||
/// <summary>Between soft and hard thresholds — log + surface, no action.</summary>
|
||||
SoftBreach,
|
||||
|
||||
/// <summary>
|
||||
/// ≥ hard threshold. Log + surface + (Tier C only, via <c>MemoryRecycle</c>) request
|
||||
/// process recycle via the driver supervisor. Tier A/B breach never invokes any
|
||||
/// kill path per decisions #145 and #74.
|
||||
/// </summary>
|
||||
HardBreach,
|
||||
}
|
||||
@@ -0,0 +1,86 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
/// <summary>
|
||||
/// Tier C opt-in periodic-recycle driver per <c>docs/v2/plan.md</c> decision #67.
|
||||
/// A tick method advanced by the caller (fed by a background timer in prod; by test clock
|
||||
/// in unit tests) decides whether the configured interval has elapsed and, if so, drives the
|
||||
/// supplied <see cref="IDriverSupervisor"/> to recycle the Host.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Tier A/B drivers MUST NOT use this class — scheduled recycle for in-process drivers would
|
||||
/// kill every OPC UA session and every co-hosted driver. The ctor throws when constructed
|
||||
/// with any tier other than C to make the misuse structurally impossible.
|
||||
///
|
||||
/// <para>Keeps no background thread of its own — callers invoke <see cref="TickAsync"/> on
|
||||
/// their ambient scheduler tick (Phase 6.1 Stream C's health-endpoint host runs one). That
|
||||
/// decouples the unit under test from wall-clock time and thread-pool scheduling.</para>
|
||||
/// </remarks>
|
||||
public sealed class ScheduledRecycleScheduler
|
||||
{
|
||||
private readonly TimeSpan _recycleInterval;
|
||||
private readonly IDriverSupervisor _supervisor;
|
||||
private readonly ILogger<ScheduledRecycleScheduler> _logger;
|
||||
private DateTime _nextRecycleUtc;
|
||||
|
||||
/// <summary>
|
||||
/// Construct the scheduler for a Tier C driver. Throws if <paramref name="tier"/> isn't C.
|
||||
/// </summary>
|
||||
/// <param name="tier">Driver tier; must be <see cref="DriverTier.C"/>.</param>
|
||||
/// <param name="recycleInterval">Interval between recycles (e.g. 7 days).</param>
|
||||
/// <param name="startUtc">Anchor time; next recycle fires at <paramref name="startUtc"/> + <paramref name="recycleInterval"/>.</param>
|
||||
/// <param name="supervisor">Supervisor that performs the actual recycle.</param>
|
||||
/// <param name="logger">Diagnostic sink.</param>
|
||||
public ScheduledRecycleScheduler(
|
||||
DriverTier tier,
|
||||
TimeSpan recycleInterval,
|
||||
DateTime startUtc,
|
||||
IDriverSupervisor supervisor,
|
||||
ILogger<ScheduledRecycleScheduler> logger)
|
||||
{
|
||||
if (tier != DriverTier.C)
|
||||
throw new ArgumentException(
|
||||
$"ScheduledRecycleScheduler is Tier C only (got {tier}). " +
|
||||
"In-process drivers must not use scheduled recycle; see decisions #74 and #145.",
|
||||
nameof(tier));
|
||||
|
||||
if (recycleInterval <= TimeSpan.Zero)
|
||||
throw new ArgumentException("RecycleInterval must be positive.", nameof(recycleInterval));
|
||||
|
||||
_recycleInterval = recycleInterval;
|
||||
_supervisor = supervisor;
|
||||
_logger = logger;
|
||||
_nextRecycleUtc = startUtc + recycleInterval;
|
||||
}
|
||||
|
||||
/// <summary>Next scheduled recycle UTC. Advances by <see cref="RecycleInterval"/> on each fire.</summary>
|
||||
public DateTime NextRecycleUtc => _nextRecycleUtc;
|
||||
|
||||
/// <summary>Recycle interval this scheduler was constructed with.</summary>
|
||||
public TimeSpan RecycleInterval => _recycleInterval;
|
||||
|
||||
/// <summary>
|
||||
/// Tick the scheduler forward. If <paramref name="utcNow"/> is past
|
||||
/// <see cref="NextRecycleUtc"/>, requests a recycle from the supervisor and advances
|
||||
/// <see cref="NextRecycleUtc"/> by exactly one interval. Returns true when a recycle fired.
|
||||
/// </summary>
|
||||
public async Task<bool> TickAsync(DateTime utcNow, CancellationToken cancellationToken)
|
||||
{
|
||||
if (utcNow < _nextRecycleUtc)
|
||||
return false;
|
||||
|
||||
_logger.LogInformation(
|
||||
"Scheduled recycle due for Tier C driver {DriverId} at {Now:o}; advancing next to {Next:o}.",
|
||||
_supervisor.DriverInstanceId, utcNow, _nextRecycleUtc + _recycleInterval);
|
||||
|
||||
await _supervisor.RecycleAsync("Scheduled periodic recycle", cancellationToken).ConfigureAwait(false);
|
||||
_nextRecycleUtc += _recycleInterval;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// <summary>Request an immediate recycle outside the schedule (e.g. MemoryRecycle hard-breach escalation).</summary>
|
||||
public Task RequestRecycleNowAsync(string reason, CancellationToken cancellationToken) =>
|
||||
_supervisor.RecycleAsync(reason, cancellationToken);
|
||||
}
|
||||
81
src/ZB.MOM.WW.OtOpcUa.Core/Stability/WedgeDetector.cs
Normal file
81
src/ZB.MOM.WW.OtOpcUa.Core/Stability/WedgeDetector.cs
Normal file
@@ -0,0 +1,81 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
/// <summary>
|
||||
/// Demand-aware driver-wedge detector per <c>docs/v2/plan.md</c> decision #147.
|
||||
/// Flips a driver to <see cref="WedgeVerdict.Faulted"/> only when BOTH of the following hold:
|
||||
/// (a) there is pending work outstanding, AND (b) no progress has been observed for longer
|
||||
/// than <see cref="Threshold"/>. Idle drivers, write-only burst drivers, and subscription-only
|
||||
/// drivers whose signals don't arrive regularly all stay Healthy.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>Pending work signal is supplied by the caller via <see cref="DemandSignal"/>:
|
||||
/// non-zero Polly bulkhead depth, ≥1 active MonitoredItem, or ≥1 queued historian read
|
||||
/// each qualifies. The detector itself is state-light: all it remembers is the last
|
||||
/// <c>LastProgressUtc</c> it saw and the last wedge verdict. No history buffer.</para>
|
||||
///
|
||||
/// <para>Default threshold per plan: <c>5 × PublishingInterval</c>, with a minimum of 60 s.
|
||||
/// Concrete values are driver-agnostic and configured per-instance by the caller.</para>
|
||||
/// </remarks>
|
||||
public sealed class WedgeDetector
|
||||
{
|
||||
/// <summary>Wedge-detection threshold; pass < 60 s and the detector clamps to 60 s.</summary>
|
||||
public TimeSpan Threshold { get; }
|
||||
|
||||
/// <summary>Whether the driver reported itself <see cref="DriverState.Healthy"/> at construction.</summary>
|
||||
public WedgeDetector(TimeSpan threshold)
|
||||
{
|
||||
Threshold = threshold < TimeSpan.FromSeconds(60) ? TimeSpan.FromSeconds(60) : threshold;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Classify the current state against the demand signal. Does not retain state across
|
||||
/// calls — each call is self-contained; the caller owns the <c>LastProgressUtc</c> clock.
|
||||
/// </summary>
|
||||
public WedgeVerdict Classify(DriverState state, DemandSignal demand, DateTime utcNow)
|
||||
{
|
||||
if (state != DriverState.Healthy)
|
||||
return WedgeVerdict.NotApplicable;
|
||||
|
||||
if (!demand.HasPendingWork)
|
||||
return WedgeVerdict.Idle;
|
||||
|
||||
var sinceProgress = utcNow - demand.LastProgressUtc;
|
||||
return sinceProgress > Threshold ? WedgeVerdict.Faulted : WedgeVerdict.Healthy;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Caller-supplied demand snapshot. All three counters are OR'd — any non-zero means work
|
||||
/// is outstanding, which is the trigger for checking the <see cref="LastProgressUtc"/> clock.
|
||||
/// </summary>
|
||||
/// <param name="BulkheadDepth">Polly bulkhead depth (in-flight capability calls).</param>
|
||||
/// <param name="ActiveMonitoredItems">Number of live OPC UA MonitoredItems bound to this driver.</param>
|
||||
/// <param name="QueuedHistoryReads">Pending historian-read requests the driver owes the server.</param>
|
||||
/// <param name="LastProgressUtc">Last time the driver reported a successful unit of work (read, subscribe-ack, publish).</param>
|
||||
public readonly record struct DemandSignal(
|
||||
int BulkheadDepth,
|
||||
int ActiveMonitoredItems,
|
||||
int QueuedHistoryReads,
|
||||
DateTime LastProgressUtc)
|
||||
{
|
||||
/// <summary>True when any of the three counters is > 0.</summary>
|
||||
public bool HasPendingWork => BulkheadDepth > 0 || ActiveMonitoredItems > 0 || QueuedHistoryReads > 0;
|
||||
}
|
||||
|
||||
/// <summary>Outcome of a single <see cref="WedgeDetector.Classify"/> call.</summary>
|
||||
public enum WedgeVerdict
|
||||
{
|
||||
/// <summary>Driver wasn't Healthy to begin with — wedge detection doesn't apply.</summary>
|
||||
NotApplicable,
|
||||
|
||||
/// <summary>Driver claims Healthy + no pending work → stays Healthy.</summary>
|
||||
Idle,
|
||||
|
||||
/// <summary>Driver claims Healthy + has pending work + has made progress within the threshold → stays Healthy.</summary>
|
||||
Healthy,
|
||||
|
||||
/// <summary>Driver claims Healthy + has pending work + has NOT made progress within the threshold → wedged.</summary>
|
||||
Faulted,
|
||||
}
|
||||
@@ -16,6 +16,10 @@
|
||||
<ProjectReference Include="..\ZB.MOM.WW.OtOpcUa.Configuration\ZB.MOM.WW.OtOpcUa.Configuration.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Polly.Core" Version="8.6.6"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<InternalsVisibleTo Include="ZB.MOM.WW.OtOpcUa.Core.Tests"/>
|
||||
</ItemGroup>
|
||||
|
||||
@@ -339,6 +339,64 @@ public sealed class GalaxyProxyDriver(GalaxyProxyOptions options)
|
||||
return new HistoryReadResult(samples, ContinuationPoint: null);
|
||||
}
|
||||
|
||||
public async Task<HistoryReadResult> ReadAtTimeAsync(
|
||||
string fullReference, IReadOnlyList<DateTime> timestampsUtc, CancellationToken cancellationToken)
|
||||
{
|
||||
var client = RequireClient();
|
||||
var resp = await client.CallAsync<HistoryReadAtTimeRequest, HistoryReadAtTimeResponse>(
|
||||
MessageKind.HistoryReadAtTimeRequest,
|
||||
new HistoryReadAtTimeRequest
|
||||
{
|
||||
SessionId = _sessionId,
|
||||
TagReference = fullReference,
|
||||
TimestampsUtcUnixMs = [.. timestampsUtc.Select(t => new DateTimeOffset(t, TimeSpan.Zero).ToUnixTimeMilliseconds())],
|
||||
},
|
||||
MessageKind.HistoryReadAtTimeResponse,
|
||||
cancellationToken);
|
||||
|
||||
if (!resp.Success)
|
||||
throw new InvalidOperationException($"Galaxy.Host HistoryReadAtTime failed: {resp.Error}");
|
||||
|
||||
// ReadAtTime returns one sample per requested timestamp in the same order — the Host
|
||||
// pads with bad-quality snapshots when a timestamp can't be interpolated, so response
|
||||
// length matches request length exactly. We trust that contract rather than
|
||||
// re-aligning here, because the Host is the source-of-truth for interpolation policy.
|
||||
IReadOnlyList<DataValueSnapshot> samples = [.. resp.Values.Select(ToSnapshot)];
|
||||
return new HistoryReadResult(samples, ContinuationPoint: null);
|
||||
}
|
||||
|
||||
public async Task<HistoricalEventsResult> ReadEventsAsync(
|
||||
string? sourceName, DateTime startUtc, DateTime endUtc, int maxEvents, CancellationToken cancellationToken)
|
||||
{
|
||||
var client = RequireClient();
|
||||
var resp = await client.CallAsync<HistoryReadEventsRequest, HistoryReadEventsResponse>(
|
||||
MessageKind.HistoryReadEventsRequest,
|
||||
new HistoryReadEventsRequest
|
||||
{
|
||||
SessionId = _sessionId,
|
||||
SourceName = sourceName,
|
||||
StartUtcUnixMs = new DateTimeOffset(startUtc, TimeSpan.Zero).ToUnixTimeMilliseconds(),
|
||||
EndUtcUnixMs = new DateTimeOffset(endUtc, TimeSpan.Zero).ToUnixTimeMilliseconds(),
|
||||
MaxEvents = maxEvents,
|
||||
},
|
||||
MessageKind.HistoryReadEventsResponse,
|
||||
cancellationToken);
|
||||
|
||||
if (!resp.Success)
|
||||
throw new InvalidOperationException($"Galaxy.Host HistoryReadEvents failed: {resp.Error}");
|
||||
|
||||
IReadOnlyList<HistoricalEvent> events = [.. resp.Events.Select(ToHistoricalEvent)];
|
||||
return new HistoricalEventsResult(events, ContinuationPoint: null);
|
||||
}
|
||||
|
||||
internal static HistoricalEvent ToHistoricalEvent(GalaxyHistoricalEvent wire) => new(
|
||||
EventId: wire.EventId,
|
||||
SourceName: wire.SourceName,
|
||||
EventTimeUtc: DateTimeOffset.FromUnixTimeMilliseconds(wire.EventTimeUtcUnixMs).UtcDateTime,
|
||||
ReceivedTimeUtc: DateTimeOffset.FromUnixTimeMilliseconds(wire.ReceivedTimeUtcUnixMs).UtcDateTime,
|
||||
Message: wire.DisplayText,
|
||||
Severity: wire.Severity);
|
||||
|
||||
/// <summary>
|
||||
/// Maps the OPC UA Part 13 aggregate enum onto the Wonderware Historian
|
||||
/// AnalogSummaryQuery column names consumed by <c>HistorianDataSource.ReadAggregateAsync</c>.
|
||||
|
||||
165
src/ZB.MOM.WW.OtOpcUa.Driver.Modbus/DirectLogicAddress.cs
Normal file
165
src/ZB.MOM.WW.OtOpcUa.Driver.Modbus/DirectLogicAddress.cs
Normal file
@@ -0,0 +1,165 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Modbus;
|
||||
|
||||
/// <summary>
|
||||
/// AutomationDirect DirectLOGIC address-translation helpers. DL205 / DL260 / DL350 CPUs
|
||||
/// address V-memory in OCTAL while the Modbus wire uses DECIMAL PDU addresses — operators
|
||||
/// see "V2000" in the PLC ladder-logic editor but the Modbus client must write PDU 0x0400.
|
||||
/// The formulas differ between user V-memory (simple octal-to-decimal) and system V-memory
|
||||
/// (fixed bank mappings), so the two cases are separate methods rather than one overloaded
|
||||
/// "ToPdu" call.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// See <c>docs/v2/dl205.md</c> §V-memory for the full CPU-family matrix + rationale.
|
||||
/// References: D2-USER-M appendix (DL205/D2-260), H2-ECOM-M §6.5 (absolute vs relative
|
||||
/// addressing), AutomationDirect forum guidance on V40400 system-base.
|
||||
/// </remarks>
|
||||
public static class DirectLogicAddress
|
||||
{
|
||||
/// <summary>
|
||||
/// Convert a DirectLOGIC user V-memory address (octal) to a 0-based Modbus PDU address.
|
||||
/// Accepts bare octal (<c>"2000"</c>) or <c>V</c>-prefixed (<c>"V2000"</c>). Range
|
||||
/// depends on CPU model — DL205 D2-260 user memory is V1400-V7377 + V10000-V17777
|
||||
/// octal, DL260 extends to V77777 octal.
|
||||
/// </summary>
|
||||
/// <exception cref="ArgumentException">Input is null / empty / contains non-octal digits (8,9).</exception>
|
||||
/// <exception cref="OverflowException">Parsed value exceeds ushort.MaxValue (0xFFFF).</exception>
|
||||
public static ushort UserVMemoryToPdu(string vAddress)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(vAddress))
|
||||
throw new ArgumentException("V-memory address must not be empty", nameof(vAddress));
|
||||
var s = vAddress.Trim();
|
||||
if (s[0] == 'V' || s[0] == 'v') s = s.Substring(1);
|
||||
if (s.Length == 0)
|
||||
throw new ArgumentException($"V-memory address '{vAddress}' has no digits", nameof(vAddress));
|
||||
|
||||
// Octal conversion. Reject 8/9 digits up-front — int.Parse in the obvious base would
|
||||
// accept them silently because .NET has no built-in base-8 parser.
|
||||
uint result = 0;
|
||||
foreach (var ch in s)
|
||||
{
|
||||
if (ch < '0' || ch > '7')
|
||||
throw new ArgumentException(
|
||||
$"V-memory address '{vAddress}' contains non-octal digit '{ch}' — DirectLOGIC V-addresses are octal (0-7)",
|
||||
nameof(vAddress));
|
||||
result = result * 8 + (uint)(ch - '0');
|
||||
if (result > ushort.MaxValue)
|
||||
throw new OverflowException(
|
||||
$"V-memory address '{vAddress}' exceeds the 16-bit Modbus PDU address range");
|
||||
}
|
||||
return (ushort)result;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// DirectLOGIC system V-memory starts at octal V40400 on DL260 / H2-ECOM100 in factory
|
||||
/// "absolute" addressing mode. Unlike user V-memory, the mapping is NOT a simple
|
||||
/// octal-to-decimal conversion — the CPU relocates the system bank to Modbus PDU 0x2100
|
||||
/// (decimal 8448). This helper returns the CPU-family base plus a user-supplied offset
|
||||
/// within the system bank.
|
||||
/// </summary>
|
||||
public const ushort SystemVMemoryBasePdu = 0x2100;
|
||||
|
||||
/// <param name="offsetWithinSystemBank">
|
||||
/// 0-based register offset within the system bank. Pass 0 for V40400 itself; pass 1 for
|
||||
/// V40401 (octal), and so on. NOT an octal-decoded value — the system bank lives at
|
||||
/// consecutive PDU addresses, so the offset is plain decimal.
|
||||
/// </param>
|
||||
public static ushort SystemVMemoryToPdu(ushort offsetWithinSystemBank)
|
||||
{
|
||||
var pdu = SystemVMemoryBasePdu + offsetWithinSystemBank;
|
||||
if (pdu > ushort.MaxValue)
|
||||
throw new OverflowException(
|
||||
$"System V-memory offset {offsetWithinSystemBank} maps past 0xFFFF");
|
||||
return (ushort)pdu;
|
||||
}
|
||||
|
||||
// Bit-memory bases per DL260 user manual §I/O-configuration.
|
||||
// Numbers after X / Y / C / SP are OCTAL in DirectLOGIC notation. The Modbus base is
|
||||
// added to the octal-decoded offset; e.g. Y017 = Modbus coil 2048 + octal(17) = 2048 + 15 = 2063.
|
||||
|
||||
/// <summary>
|
||||
/// DL260 Y-output coil base. Y0 octal → Modbus coil address 2048 (0-based).
|
||||
/// </summary>
|
||||
public const ushort YOutputBaseCoil = 2048;
|
||||
|
||||
/// <summary>
|
||||
/// DL260 C-relay coil base. C0 octal → Modbus coil address 3072 (0-based).
|
||||
/// </summary>
|
||||
public const ushort CRelayBaseCoil = 3072;
|
||||
|
||||
/// <summary>
|
||||
/// DL260 X-input discrete-input base. X0 octal → Modbus discrete input 0.
|
||||
/// </summary>
|
||||
public const ushort XInputBaseDiscrete = 0;
|
||||
|
||||
/// <summary>
|
||||
/// DL260 SP special-relay discrete-input base. SP0 octal → Modbus discrete input 1024.
|
||||
/// Read-only; writing SP relays is rejected with Illegal Data Address.
|
||||
/// </summary>
|
||||
public const ushort SpecialBaseDiscrete = 1024;
|
||||
|
||||
/// <summary>
|
||||
/// Translate a DirectLOGIC Y-output address (e.g. <c>"Y0"</c>, <c>"Y17"</c>) to its
|
||||
/// 0-based Modbus coil address on DL260. The trailing number is OCTAL, matching the
|
||||
/// ladder-logic editor's notation.
|
||||
/// </summary>
|
||||
public static ushort YOutputToCoil(string yAddress) =>
|
||||
AddOctalOffset(YOutputBaseCoil, StripPrefix(yAddress, 'Y'));
|
||||
|
||||
/// <summary>
|
||||
/// Translate a DirectLOGIC C-relay address (e.g. <c>"C0"</c>, <c>"C1777"</c>) to its
|
||||
/// 0-based Modbus coil address.
|
||||
/// </summary>
|
||||
public static ushort CRelayToCoil(string cAddress) =>
|
||||
AddOctalOffset(CRelayBaseCoil, StripPrefix(cAddress, 'C'));
|
||||
|
||||
/// <summary>
|
||||
/// Translate a DirectLOGIC X-input address (e.g. <c>"X0"</c>, <c>"X17"</c>) to its
|
||||
/// 0-based Modbus discrete-input address. Reading an unpopulated X returns 0, not an
|
||||
/// exception — the CPU sizes the table to configured I/O, not installed modules.
|
||||
/// </summary>
|
||||
public static ushort XInputToDiscrete(string xAddress) =>
|
||||
AddOctalOffset(XInputBaseDiscrete, StripPrefix(xAddress, 'X'));
|
||||
|
||||
/// <summary>
|
||||
/// Translate a DirectLOGIC SP-special-relay address (e.g. <c>"SP0"</c>) to its 0-based
|
||||
/// Modbus discrete-input address. Accepts <c>"SP"</c> prefix case-insensitively.
|
||||
/// </summary>
|
||||
public static ushort SpecialToDiscrete(string spAddress)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(spAddress))
|
||||
throw new ArgumentException("SP address must not be empty", nameof(spAddress));
|
||||
var s = spAddress.Trim();
|
||||
if (s.Length >= 2 && (s[0] == 'S' || s[0] == 's') && (s[1] == 'P' || s[1] == 'p'))
|
||||
s = s.Substring(2);
|
||||
return AddOctalOffset(SpecialBaseDiscrete, s);
|
||||
}
|
||||
|
||||
private static string StripPrefix(string address, char expectedPrefix)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(address))
|
||||
throw new ArgumentException("Address must not be empty", nameof(address));
|
||||
var s = address.Trim();
|
||||
if (s.Length > 0 && char.ToUpperInvariant(s[0]) == char.ToUpperInvariant(expectedPrefix))
|
||||
s = s.Substring(1);
|
||||
return s;
|
||||
}
|
||||
|
||||
private static ushort AddOctalOffset(ushort baseAddr, string octalDigits)
|
||||
{
|
||||
if (octalDigits.Length == 0)
|
||||
throw new ArgumentException("Address has no digits", nameof(octalDigits));
|
||||
uint offset = 0;
|
||||
foreach (var ch in octalDigits)
|
||||
{
|
||||
if (ch < '0' || ch > '7')
|
||||
throw new ArgumentException(
|
||||
$"Address contains non-octal digit '{ch}' — DirectLOGIC I/O addresses are octal (0-7)",
|
||||
nameof(octalDigits));
|
||||
offset = offset * 8 + (uint)(ch - '0');
|
||||
}
|
||||
var result = baseAddr + offset;
|
||||
if (result > ushort.MaxValue)
|
||||
throw new OverflowException($"Address {baseAddr}+{offset} exceeds 0xFFFF");
|
||||
return (ushort)result;
|
||||
}
|
||||
}
|
||||
161
src/ZB.MOM.WW.OtOpcUa.Driver.Modbus/MelsecAddress.cs
Normal file
161
src/ZB.MOM.WW.OtOpcUa.Driver.Modbus/MelsecAddress.cs
Normal file
@@ -0,0 +1,161 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Modbus;
|
||||
|
||||
/// <summary>
|
||||
/// Mitsubishi MELSEC PLC family selector for address-translation helpers. The Q/L/iQ-R
|
||||
/// families write bit-device addresses (X, Y) in <b>hexadecimal</b> in GX Works and the
|
||||
/// CPU manuals; the FX and iQ-F families write them in <b>octal</b> (same convention as
|
||||
/// AutomationDirect DirectLOGIC). Mixing the two up is the #1 MELSEC driver bug source —
|
||||
/// an operator typing <c>X20</c> into a Q-series tag config means decimal 32, but the
|
||||
/// same string on an FX3U means decimal 16, so the helper must know the family to route
|
||||
/// correctly.
|
||||
/// </summary>
|
||||
public enum MelsecFamily
|
||||
{
|
||||
/// <summary>
|
||||
/// MELSEC-Q / MELSEC-L / MELSEC iQ-R. X and Y device numbers are interpreted as
|
||||
/// <b>hexadecimal</b>; <c>X20</c> means decimal 32.
|
||||
/// </summary>
|
||||
Q_L_iQR,
|
||||
|
||||
/// <summary>
|
||||
/// MELSEC-F (FX3U / FX3GE / FX3G) and MELSEC iQ-F (FX5U). X and Y device numbers
|
||||
/// are interpreted as <b>octal</b> (same as DirectLOGIC); <c>X20</c> means decimal 16.
|
||||
/// iQ-F has a GX Works3 project toggle that can flip to decimal — if a site uses
|
||||
/// that, configure the tag's Address directly as a decimal PDU address and do not
|
||||
/// route through this helper.
|
||||
/// </summary>
|
||||
F_iQF,
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Mitsubishi MELSEC address-translation helpers for the QJ71MT91 / LJ71MT91 / RJ71EN71 /
|
||||
/// iQ-R built-in / iQ-F / FX3U-ENET-P502 Modbus modules. MELSEC does NOT hard-wire
|
||||
/// Modbus-to-device mappings like DL260 does — every site configures its own "Modbus
|
||||
/// Device Assignment Parameter" block of up to 16 entries. The helpers here cover only
|
||||
/// the <b>address-notation</b> portion of the translation (hex X20 vs octal X20 + adding
|
||||
/// the bank base); the caller is still responsible for knowing the assignment-block
|
||||
/// offset for their site.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// See <c>docs/v2/mitsubishi.md</c> §device-assignment + §X-Y-hex-trap for the full
|
||||
/// matrix and primary-source citations.
|
||||
/// </remarks>
|
||||
public static class MelsecAddress
|
||||
{
|
||||
/// <summary>
|
||||
/// Translate a MELSEC X-input address (e.g. <c>"X0"</c>, <c>"X10"</c>) to a 0-based
|
||||
/// Modbus discrete-input address, given the PLC family's address notation (hex or
|
||||
/// octal) and the Modbus Device Assignment block's X-range base.
|
||||
/// </summary>
|
||||
/// <param name="xAddress">MELSEC X address. <c>X</c> prefix optional, case-insensitive.</param>
|
||||
/// <param name="family">The PLC family — determines whether the trailing digits are hex or octal.</param>
|
||||
/// <param name="xBankBase">
|
||||
/// 0-based Modbus DI address the assignment-block has configured X0 to land at.
|
||||
/// Typical default on QJ71MT91 sample projects: 0. Pass the site-specific value.
|
||||
/// </param>
|
||||
public static ushort XInputToDiscrete(string xAddress, MelsecFamily family, ushort xBankBase = 0) =>
|
||||
AddFamilyOffset(xBankBase, StripPrefix(xAddress, 'X'), family);
|
||||
|
||||
/// <summary>
|
||||
/// Translate a MELSEC Y-output address to a 0-based Modbus coil address. Same rules
|
||||
/// as <see cref="XInputToDiscrete"/> for hex/octal parsing.
|
||||
/// </summary>
|
||||
public static ushort YOutputToCoil(string yAddress, MelsecFamily family, ushort yBankBase = 0) =>
|
||||
AddFamilyOffset(yBankBase, StripPrefix(yAddress, 'Y'), family);
|
||||
|
||||
/// <summary>
|
||||
/// Translate a MELSEC M-relay address (internal relay) to a 0-based Modbus coil
|
||||
/// address. M-addresses are <b>decimal</b> on every MELSEC family — unlike X/Y which
|
||||
/// are hex on Q/L/iQ-R. Includes the bank base that the assignment-block configured.
|
||||
/// </summary>
|
||||
public static ushort MRelayToCoil(string mAddress, ushort mBankBase = 0)
|
||||
{
|
||||
var digits = StripPrefix(mAddress, 'M');
|
||||
if (!ushort.TryParse(digits, out var offset))
|
||||
throw new ArgumentException(
|
||||
$"M-relay address '{mAddress}' is not a valid decimal integer", nameof(mAddress));
|
||||
var result = mBankBase + offset;
|
||||
if (result > ushort.MaxValue)
|
||||
throw new OverflowException($"M-relay {mAddress} + base {mBankBase} exceeds 0xFFFF");
|
||||
return (ushort)result;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Translate a MELSEC D-register address (data register) to a 0-based Modbus holding
|
||||
/// register address. D-addresses are <b>decimal</b>. Default assignment convention is
|
||||
/// D0 → HR 0 (pass <paramref name="dBankBase"/> = 0); sites with shifted layouts pass
|
||||
/// their configured base.
|
||||
/// </summary>
|
||||
public static ushort DRegisterToHolding(string dAddress, ushort dBankBase = 0)
|
||||
{
|
||||
var digits = StripPrefix(dAddress, 'D');
|
||||
if (!ushort.TryParse(digits, out var offset))
|
||||
throw new ArgumentException(
|
||||
$"D-register address '{dAddress}' is not a valid decimal integer", nameof(dAddress));
|
||||
var result = dBankBase + offset;
|
||||
if (result > ushort.MaxValue)
|
||||
throw new OverflowException($"D-register {dAddress} + base {dBankBase} exceeds 0xFFFF");
|
||||
return (ushort)result;
|
||||
}
|
||||
|
||||
private static string StripPrefix(string address, char expectedPrefix)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(address))
|
||||
throw new ArgumentException("Address must not be empty", nameof(address));
|
||||
var s = address.Trim();
|
||||
if (s.Length > 0 && char.ToUpperInvariant(s[0]) == char.ToUpperInvariant(expectedPrefix))
|
||||
s = s.Substring(1);
|
||||
if (s.Length == 0)
|
||||
throw new ArgumentException($"Address '{address}' has no digits after prefix", nameof(address));
|
||||
return s;
|
||||
}
|
||||
|
||||
private static ushort AddFamilyOffset(ushort baseAddr, string digits, MelsecFamily family)
|
||||
{
|
||||
uint offset = family switch
|
||||
{
|
||||
MelsecFamily.Q_L_iQR => ParseHex(digits),
|
||||
MelsecFamily.F_iQF => ParseOctal(digits),
|
||||
_ => throw new ArgumentOutOfRangeException(nameof(family), family, "Unknown MELSEC family"),
|
||||
};
|
||||
var result = baseAddr + offset;
|
||||
if (result > ushort.MaxValue)
|
||||
throw new OverflowException($"Address {baseAddr}+{offset} exceeds 0xFFFF");
|
||||
return (ushort)result;
|
||||
}
|
||||
|
||||
private static uint ParseHex(string digits)
|
||||
{
|
||||
uint result = 0;
|
||||
foreach (var ch in digits)
|
||||
{
|
||||
uint nibble;
|
||||
if (ch >= '0' && ch <= '9') nibble = (uint)(ch - '0');
|
||||
else if (ch >= 'A' && ch <= 'F') nibble = (uint)(ch - 'A' + 10);
|
||||
else if (ch >= 'a' && ch <= 'f') nibble = (uint)(ch - 'a' + 10);
|
||||
else throw new ArgumentException(
|
||||
$"Address contains non-hex digit '{ch}' — Q/L/iQ-R X/Y addresses are hexadecimal",
|
||||
nameof(digits));
|
||||
result = result * 16 + nibble;
|
||||
if (result > ushort.MaxValue)
|
||||
throw new OverflowException($"Hex address exceeds 0xFFFF");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private static uint ParseOctal(string digits)
|
||||
{
|
||||
uint result = 0;
|
||||
foreach (var ch in digits)
|
||||
{
|
||||
if (ch < '0' || ch > '7')
|
||||
throw new ArgumentException(
|
||||
$"Address contains non-octal digit '{ch}' — FX/iQ-F X/Y addresses are octal (0-7)",
|
||||
nameof(digits));
|
||||
result = result * 8 + (uint)(ch - '0');
|
||||
if (result > ushort.MaxValue)
|
||||
throw new OverflowException($"Octal address exceeds 0xFFFF");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
@@ -37,7 +37,7 @@ public sealed class ModbusDriver(ModbusDriverOptions options, string driverInsta
|
||||
private CancellationTokenSource? _probeCts;
|
||||
private readonly ModbusDriverOptions _options = options;
|
||||
private readonly Func<ModbusDriverOptions, IModbusTransport> _transportFactory =
|
||||
transportFactory ?? (o => new ModbusTcpTransport(o.Host, o.Port, o.Timeout));
|
||||
transportFactory ?? (o => new ModbusTcpTransport(o.Host, o.Port, o.Timeout, o.AutoReconnect));
|
||||
|
||||
private IModbusTransport? _transport;
|
||||
private DriverHealth _health = new(DriverState.Unknown, null, null);
|
||||
@@ -115,7 +115,8 @@ public sealed class ModbusDriver(ModbusDriverOptions options, string driverInsta
|
||||
ArrayDim: null,
|
||||
SecurityClass: t.Writable ? SecurityClassification.Operate : SecurityClassification.ViewOnly,
|
||||
IsHistorized: false,
|
||||
IsAlarm: false));
|
||||
IsAlarm: false,
|
||||
WriteIdempotent: t.WriteIdempotent));
|
||||
}
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
@@ -141,9 +142,16 @@ public sealed class ModbusDriver(ModbusDriverOptions options, string driverInsta
|
||||
results[i] = new DataValueSnapshot(value, 0u, now, now);
|
||||
_health = new DriverHealth(DriverState.Healthy, now, null);
|
||||
}
|
||||
catch (ModbusException mex)
|
||||
{
|
||||
results[i] = new DataValueSnapshot(null, MapModbusExceptionToStatus(mex.ExceptionCode), null, now);
|
||||
_health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, mex.Message);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
results[i] = new DataValueSnapshot(null, StatusBadInternalError, null, now);
|
||||
// Non-Modbus-layer failure: socket dropped, timeout, malformed response. Surface
|
||||
// as communication error so callers can distinguish it from tag-level faults.
|
||||
results[i] = new DataValueSnapshot(null, StatusBadCommunicationError, null, now);
|
||||
_health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, ex.Message);
|
||||
}
|
||||
}
|
||||
@@ -169,20 +177,50 @@ public sealed class ModbusDriver(ModbusDriverOptions options, string driverInsta
|
||||
case ModbusRegion.HoldingRegisters:
|
||||
case ModbusRegion.InputRegisters:
|
||||
{
|
||||
var quantity = RegisterCount(tag.DataType);
|
||||
var quantity = RegisterCount(tag);
|
||||
var fc = tag.Region == ModbusRegion.HoldingRegisters ? (byte)0x03 : (byte)0x04;
|
||||
var pdu = new byte[] { fc, (byte)(tag.Address >> 8), (byte)(tag.Address & 0xFF),
|
||||
(byte)(quantity >> 8), (byte)(quantity & 0xFF) };
|
||||
var resp = await transport.SendAsync(_options.UnitId, pdu, ct).ConfigureAwait(false);
|
||||
// resp = [fc][byte-count][data...]
|
||||
var data = new ReadOnlySpan<byte>(resp, 2, resp[1]);
|
||||
return DecodeRegister(data, tag.DataType);
|
||||
// Auto-chunk when the tag's register span exceeds the caller-configured cap.
|
||||
// Affects long strings (FC03/04 > 125 regs is spec-forbidden; DL205 caps at 128,
|
||||
// Mitsubishi Q caps at 64). Non-string tags max out at 4 regs so the cap never
|
||||
// triggers for numerics.
|
||||
var cap = _options.MaxRegistersPerRead == 0 ? (ushort)125 : _options.MaxRegistersPerRead;
|
||||
var data = quantity <= cap
|
||||
? await ReadRegisterBlockAsync(transport, fc, tag.Address, quantity, ct).ConfigureAwait(false)
|
||||
: await ReadRegisterBlockChunkedAsync(transport, fc, tag.Address, quantity, cap, ct).ConfigureAwait(false);
|
||||
return DecodeRegister(data, tag);
|
||||
}
|
||||
default:
|
||||
throw new InvalidOperationException($"Unknown region {tag.Region}");
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<byte[]> ReadRegisterBlockAsync(
|
||||
IModbusTransport transport, byte fc, ushort address, ushort quantity, CancellationToken ct)
|
||||
{
|
||||
var pdu = new byte[] { fc, (byte)(address >> 8), (byte)(address & 0xFF),
|
||||
(byte)(quantity >> 8), (byte)(quantity & 0xFF) };
|
||||
var resp = await transport.SendAsync(_options.UnitId, pdu, ct).ConfigureAwait(false);
|
||||
// resp = [fc][byte-count][data...]
|
||||
var data = new byte[resp[1]];
|
||||
Buffer.BlockCopy(resp, 2, data, 0, resp[1]);
|
||||
return data;
|
||||
}
|
||||
|
||||
private async Task<byte[]> ReadRegisterBlockChunkedAsync(
|
||||
IModbusTransport transport, byte fc, ushort address, ushort totalRegs, ushort cap, CancellationToken ct)
|
||||
{
|
||||
var assembled = new byte[totalRegs * 2];
|
||||
ushort done = 0;
|
||||
while (done < totalRegs)
|
||||
{
|
||||
var chunk = (ushort)Math.Min(cap, totalRegs - done);
|
||||
var chunkBytes = await ReadRegisterBlockAsync(transport, fc, (ushort)(address + done), chunk, ct).ConfigureAwait(false);
|
||||
Buffer.BlockCopy(chunkBytes, 0, assembled, done * 2, chunkBytes.Length);
|
||||
done += chunk;
|
||||
}
|
||||
return assembled;
|
||||
}
|
||||
|
||||
// ---- IWritable ----
|
||||
|
||||
public async Task<IReadOnlyList<WriteResult>> WriteAsync(
|
||||
@@ -208,6 +246,10 @@ public sealed class ModbusDriver(ModbusDriverOptions options, string driverInsta
|
||||
await WriteOneAsync(transport, tag, w.Value, cancellationToken).ConfigureAwait(false);
|
||||
results[i] = new WriteResult(0u);
|
||||
}
|
||||
catch (ModbusException mex)
|
||||
{
|
||||
results[i] = new WriteResult(MapModbusExceptionToStatus(mex.ExceptionCode));
|
||||
}
|
||||
catch (Exception)
|
||||
{
|
||||
results[i] = new WriteResult(StatusBadInternalError);
|
||||
@@ -230,7 +272,7 @@ public sealed class ModbusDriver(ModbusDriverOptions options, string driverInsta
|
||||
}
|
||||
case ModbusRegion.HoldingRegisters:
|
||||
{
|
||||
var bytes = EncodeRegister(value, tag.DataType);
|
||||
var bytes = EncodeRegister(value, tag);
|
||||
if (bytes.Length == 2)
|
||||
{
|
||||
var pdu = new byte[] { 0x06, (byte)(tag.Address >> 8), (byte)(tag.Address & 0xFF),
|
||||
@@ -239,8 +281,13 @@ public sealed class ModbusDriver(ModbusDriverOptions options, string driverInsta
|
||||
}
|
||||
else
|
||||
{
|
||||
// FC 16 (Write Multiple Registers) for 32-bit types
|
||||
// FC 16 (Write Multiple Registers) for 32-bit types.
|
||||
var qty = (ushort)(bytes.Length / 2);
|
||||
var writeCap = _options.MaxRegistersPerWrite == 0 ? (ushort)123 : _options.MaxRegistersPerWrite;
|
||||
if (qty > writeCap)
|
||||
throw new InvalidOperationException(
|
||||
$"Write of {qty} registers to {tag.Name} exceeds MaxRegistersPerWrite={writeCap}. " +
|
||||
$"Split the tag (e.g. shorter StringLength) — partial FC16 chunks would lose atomicity.");
|
||||
var pdu = new byte[6 + 1 + bytes.Length];
|
||||
pdu[0] = 0x10;
|
||||
pdu[1] = (byte)(tag.Address >> 8); pdu[2] = (byte)(tag.Address & 0xFF);
|
||||
@@ -397,82 +444,285 @@ public sealed class ModbusDriver(ModbusDriverOptions options, string driverInsta
|
||||
|
||||
// ---- codec ----
|
||||
|
||||
internal static ushort RegisterCount(ModbusDataType t) => t switch
|
||||
/// <summary>
|
||||
/// How many 16-bit registers a given tag occupies. Accounts for multi-register logical
|
||||
/// types (Int32/Float32 = 2 regs, Int64/Float64 = 4 regs) and for strings (rounded up
|
||||
/// from 2 chars per register).
|
||||
/// </summary>
|
||||
internal static ushort RegisterCount(ModbusTagDefinition tag) => tag.DataType switch
|
||||
{
|
||||
ModbusDataType.Int16 or ModbusDataType.UInt16 => 1,
|
||||
ModbusDataType.Int32 or ModbusDataType.UInt32 or ModbusDataType.Float32 => 2,
|
||||
_ => throw new InvalidOperationException($"Non-register data type {t}"),
|
||||
ModbusDataType.Int16 or ModbusDataType.UInt16 or ModbusDataType.BitInRegister or ModbusDataType.Bcd16 => 1,
|
||||
ModbusDataType.Int32 or ModbusDataType.UInt32 or ModbusDataType.Float32 or ModbusDataType.Bcd32 => 2,
|
||||
ModbusDataType.Int64 or ModbusDataType.UInt64 or ModbusDataType.Float64 => 4,
|
||||
ModbusDataType.String => (ushort)((tag.StringLength + 1) / 2), // 2 chars per register
|
||||
_ => throw new InvalidOperationException($"Non-register data type {tag.DataType}"),
|
||||
};
|
||||
|
||||
internal static object DecodeRegister(ReadOnlySpan<byte> data, ModbusDataType t) => t switch
|
||||
/// <summary>
|
||||
/// Word-swap the input into the big-endian layout the decoders expect. For 2-register
|
||||
/// types this reverses the two words; for 4-register types it reverses the four words
|
||||
/// (PLC stored [hi-mid, low-mid, hi-high, low-high] → memory [hi-high, low-high, hi-mid, low-mid]).
|
||||
/// </summary>
|
||||
private static byte[] NormalizeWordOrder(ReadOnlySpan<byte> data, ModbusByteOrder order)
|
||||
{
|
||||
ModbusDataType.Int16 => BinaryPrimitives.ReadInt16BigEndian(data),
|
||||
ModbusDataType.UInt16 => BinaryPrimitives.ReadUInt16BigEndian(data),
|
||||
ModbusDataType.Int32 => BinaryPrimitives.ReadInt32BigEndian(data),
|
||||
ModbusDataType.UInt32 => BinaryPrimitives.ReadUInt32BigEndian(data),
|
||||
ModbusDataType.Float32 => BinaryPrimitives.ReadSingleBigEndian(data),
|
||||
_ => throw new InvalidOperationException($"Non-register data type {t}"),
|
||||
};
|
||||
if (order == ModbusByteOrder.BigEndian) return data.ToArray();
|
||||
var result = new byte[data.Length];
|
||||
for (var word = 0; word < data.Length / 2; word++)
|
||||
{
|
||||
var srcWord = data.Length / 2 - 1 - word;
|
||||
result[word * 2] = data[srcWord * 2];
|
||||
result[word * 2 + 1] = data[srcWord * 2 + 1];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
internal static byte[] EncodeRegister(object? value, ModbusDataType t)
|
||||
internal static object DecodeRegister(ReadOnlySpan<byte> data, ModbusTagDefinition tag)
|
||||
{
|
||||
switch (t)
|
||||
switch (tag.DataType)
|
||||
{
|
||||
case ModbusDataType.Int16: return BinaryPrimitives.ReadInt16BigEndian(data);
|
||||
case ModbusDataType.UInt16: return BinaryPrimitives.ReadUInt16BigEndian(data);
|
||||
case ModbusDataType.Bcd16:
|
||||
{
|
||||
var raw = BinaryPrimitives.ReadUInt16BigEndian(data);
|
||||
return (int)DecodeBcd(raw, nibbles: 4);
|
||||
}
|
||||
case ModbusDataType.Bcd32:
|
||||
{
|
||||
var b = NormalizeWordOrder(data, tag.ByteOrder);
|
||||
var raw = BinaryPrimitives.ReadUInt32BigEndian(b);
|
||||
return (int)DecodeBcd(raw, nibbles: 8);
|
||||
}
|
||||
case ModbusDataType.BitInRegister:
|
||||
{
|
||||
var raw = BinaryPrimitives.ReadUInt16BigEndian(data);
|
||||
return (raw & (1 << tag.BitIndex)) != 0;
|
||||
}
|
||||
case ModbusDataType.Int32:
|
||||
{
|
||||
var b = NormalizeWordOrder(data, tag.ByteOrder);
|
||||
return BinaryPrimitives.ReadInt32BigEndian(b);
|
||||
}
|
||||
case ModbusDataType.UInt32:
|
||||
{
|
||||
var b = NormalizeWordOrder(data, tag.ByteOrder);
|
||||
return BinaryPrimitives.ReadUInt32BigEndian(b);
|
||||
}
|
||||
case ModbusDataType.Float32:
|
||||
{
|
||||
var b = NormalizeWordOrder(data, tag.ByteOrder);
|
||||
return BinaryPrimitives.ReadSingleBigEndian(b);
|
||||
}
|
||||
case ModbusDataType.Int64:
|
||||
{
|
||||
var b = NormalizeWordOrder(data, tag.ByteOrder);
|
||||
return BinaryPrimitives.ReadInt64BigEndian(b);
|
||||
}
|
||||
case ModbusDataType.UInt64:
|
||||
{
|
||||
var b = NormalizeWordOrder(data, tag.ByteOrder);
|
||||
return BinaryPrimitives.ReadUInt64BigEndian(b);
|
||||
}
|
||||
case ModbusDataType.Float64:
|
||||
{
|
||||
var b = NormalizeWordOrder(data, tag.ByteOrder);
|
||||
return BinaryPrimitives.ReadDoubleBigEndian(b);
|
||||
}
|
||||
case ModbusDataType.String:
|
||||
{
|
||||
// ASCII, 2 chars per register. HighByteFirst (standard) packs the first char in
|
||||
// the high byte of each register; LowByteFirst (DL205/DL260) packs the first char
|
||||
// in the low byte. Respect StringLength (truncate nul-padded regions).
|
||||
var chars = new char[tag.StringLength];
|
||||
for (var i = 0; i < tag.StringLength; i++)
|
||||
{
|
||||
var regIdx = i / 2;
|
||||
var highByte = data[regIdx * 2];
|
||||
var lowByte = data[regIdx * 2 + 1];
|
||||
byte b;
|
||||
if (tag.StringByteOrder == ModbusStringByteOrder.HighByteFirst)
|
||||
b = (i % 2 == 0) ? highByte : lowByte;
|
||||
else
|
||||
b = (i % 2 == 0) ? lowByte : highByte;
|
||||
if (b == 0) return new string(chars, 0, i);
|
||||
chars[i] = (char)b;
|
||||
}
|
||||
return new string(chars);
|
||||
}
|
||||
default:
|
||||
throw new InvalidOperationException($"Non-register data type {tag.DataType}");
|
||||
}
|
||||
}
|
||||
|
||||
internal static byte[] EncodeRegister(object? value, ModbusTagDefinition tag)
|
||||
{
|
||||
switch (tag.DataType)
|
||||
{
|
||||
case ModbusDataType.Int16:
|
||||
{
|
||||
var v = Convert.ToInt16(value);
|
||||
var b = new byte[2];
|
||||
BinaryPrimitives.WriteInt16BigEndian(b, v);
|
||||
return b;
|
||||
var b = new byte[2]; BinaryPrimitives.WriteInt16BigEndian(b, v); return b;
|
||||
}
|
||||
case ModbusDataType.UInt16:
|
||||
{
|
||||
var v = Convert.ToUInt16(value);
|
||||
var b = new byte[2];
|
||||
BinaryPrimitives.WriteUInt16BigEndian(b, v);
|
||||
return b;
|
||||
var b = new byte[2]; BinaryPrimitives.WriteUInt16BigEndian(b, v); return b;
|
||||
}
|
||||
case ModbusDataType.Bcd16:
|
||||
{
|
||||
var v = Convert.ToUInt32(value);
|
||||
if (v > 9999) throw new OverflowException($"BCD16 value {v} exceeds 4 decimal digits");
|
||||
var raw = (ushort)EncodeBcd(v, nibbles: 4);
|
||||
var b = new byte[2]; BinaryPrimitives.WriteUInt16BigEndian(b, raw); return b;
|
||||
}
|
||||
case ModbusDataType.Bcd32:
|
||||
{
|
||||
var v = Convert.ToUInt32(value);
|
||||
if (v > 99_999_999u) throw new OverflowException($"BCD32 value {v} exceeds 8 decimal digits");
|
||||
var raw = EncodeBcd(v, nibbles: 8);
|
||||
var b = new byte[4]; BinaryPrimitives.WriteUInt32BigEndian(b, raw);
|
||||
return NormalizeWordOrder(b, tag.ByteOrder);
|
||||
}
|
||||
case ModbusDataType.Int32:
|
||||
{
|
||||
var v = Convert.ToInt32(value);
|
||||
var b = new byte[4];
|
||||
BinaryPrimitives.WriteInt32BigEndian(b, v);
|
||||
return b;
|
||||
var b = new byte[4]; BinaryPrimitives.WriteInt32BigEndian(b, v);
|
||||
return NormalizeWordOrder(b, tag.ByteOrder);
|
||||
}
|
||||
case ModbusDataType.UInt32:
|
||||
{
|
||||
var v = Convert.ToUInt32(value);
|
||||
var b = new byte[4];
|
||||
BinaryPrimitives.WriteUInt32BigEndian(b, v);
|
||||
return b;
|
||||
var b = new byte[4]; BinaryPrimitives.WriteUInt32BigEndian(b, v);
|
||||
return NormalizeWordOrder(b, tag.ByteOrder);
|
||||
}
|
||||
case ModbusDataType.Float32:
|
||||
{
|
||||
var v = Convert.ToSingle(value);
|
||||
var b = new byte[4];
|
||||
BinaryPrimitives.WriteSingleBigEndian(b, v);
|
||||
var b = new byte[4]; BinaryPrimitives.WriteSingleBigEndian(b, v);
|
||||
return NormalizeWordOrder(b, tag.ByteOrder);
|
||||
}
|
||||
case ModbusDataType.Int64:
|
||||
{
|
||||
var v = Convert.ToInt64(value);
|
||||
var b = new byte[8]; BinaryPrimitives.WriteInt64BigEndian(b, v);
|
||||
return NormalizeWordOrder(b, tag.ByteOrder);
|
||||
}
|
||||
case ModbusDataType.UInt64:
|
||||
{
|
||||
var v = Convert.ToUInt64(value);
|
||||
var b = new byte[8]; BinaryPrimitives.WriteUInt64BigEndian(b, v);
|
||||
return NormalizeWordOrder(b, tag.ByteOrder);
|
||||
}
|
||||
case ModbusDataType.Float64:
|
||||
{
|
||||
var v = Convert.ToDouble(value);
|
||||
var b = new byte[8]; BinaryPrimitives.WriteDoubleBigEndian(b, v);
|
||||
return NormalizeWordOrder(b, tag.ByteOrder);
|
||||
}
|
||||
case ModbusDataType.String:
|
||||
{
|
||||
var s = Convert.ToString(value) ?? string.Empty;
|
||||
var regs = (tag.StringLength + 1) / 2;
|
||||
var b = new byte[regs * 2];
|
||||
for (var i = 0; i < tag.StringLength && i < s.Length; i++)
|
||||
{
|
||||
var regIdx = i / 2;
|
||||
var destIdx = tag.StringByteOrder == ModbusStringByteOrder.HighByteFirst
|
||||
? (i % 2 == 0 ? regIdx * 2 : regIdx * 2 + 1)
|
||||
: (i % 2 == 0 ? regIdx * 2 + 1 : regIdx * 2);
|
||||
b[destIdx] = (byte)s[i];
|
||||
}
|
||||
// remaining bytes stay 0 — nul-padded per PLC convention
|
||||
return b;
|
||||
}
|
||||
case ModbusDataType.BitInRegister:
|
||||
throw new InvalidOperationException(
|
||||
"BitInRegister writes require a read-modify-write; not supported in PR 24 (separate follow-up).");
|
||||
default:
|
||||
throw new InvalidOperationException($"Non-register data type {t}");
|
||||
throw new InvalidOperationException($"Non-register data type {tag.DataType}");
|
||||
}
|
||||
}
|
||||
|
||||
private static DriverDataType MapDataType(ModbusDataType t) => t switch
|
||||
{
|
||||
ModbusDataType.Bool => DriverDataType.Boolean,
|
||||
ModbusDataType.Bool or ModbusDataType.BitInRegister => DriverDataType.Boolean,
|
||||
ModbusDataType.Int16 or ModbusDataType.Int32 => DriverDataType.Int32,
|
||||
ModbusDataType.UInt16 or ModbusDataType.UInt32 => DriverDataType.Int32,
|
||||
ModbusDataType.Int64 or ModbusDataType.UInt64 => DriverDataType.Int32, // widening to Int32 loses precision; PR 25 adds Int64 to DriverDataType
|
||||
ModbusDataType.Float32 => DriverDataType.Float32,
|
||||
ModbusDataType.Float64 => DriverDataType.Float64,
|
||||
ModbusDataType.String => DriverDataType.String,
|
||||
ModbusDataType.Bcd16 or ModbusDataType.Bcd32 => DriverDataType.Int32,
|
||||
_ => DriverDataType.Int32,
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Decode an N-nibble binary-coded-decimal value. Each nibble of <paramref name="raw"/>
|
||||
/// encodes one decimal digit (most-significant nibble first). Rejects nibbles > 9 —
|
||||
/// the hardware sometimes produces garbage during transitions and silent non-BCD reads
|
||||
/// would quietly corrupt the caller's data.
|
||||
/// </summary>
|
||||
internal static uint DecodeBcd(uint raw, int nibbles)
|
||||
{
|
||||
uint result = 0;
|
||||
for (var i = nibbles - 1; i >= 0; i--)
|
||||
{
|
||||
var digit = (raw >> (i * 4)) & 0xF;
|
||||
if (digit > 9)
|
||||
throw new InvalidDataException(
|
||||
$"Non-BCD nibble 0x{digit:X} at position {i} of raw=0x{raw:X}");
|
||||
result = result * 10 + digit;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Encode a decimal value as N-nibble BCD. Caller is responsible for range-checking
|
||||
/// against the nibble capacity (10^nibbles - 1).
|
||||
/// </summary>
|
||||
internal static uint EncodeBcd(uint value, int nibbles)
|
||||
{
|
||||
uint result = 0;
|
||||
for (var i = 0; i < nibbles; i++)
|
||||
{
|
||||
var digit = value % 10;
|
||||
result |= digit << (i * 4);
|
||||
value /= 10;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private IModbusTransport RequireTransport() =>
|
||||
_transport ?? throw new InvalidOperationException("ModbusDriver not initialized");
|
||||
|
||||
private const uint StatusBadInternalError = 0x80020000u;
|
||||
private const uint StatusBadNodeIdUnknown = 0x80340000u;
|
||||
private const uint StatusBadNotWritable = 0x803B0000u;
|
||||
private const uint StatusBadOutOfRange = 0x803C0000u;
|
||||
private const uint StatusBadNotSupported = 0x803D0000u;
|
||||
private const uint StatusBadDeviceFailure = 0x80550000u;
|
||||
private const uint StatusBadCommunicationError = 0x80050000u;
|
||||
|
||||
/// <summary>
|
||||
/// Map a server-returned Modbus exception code to the most informative OPC UA
|
||||
/// StatusCode. Keeps the driver's outward-facing status surface aligned with what a
|
||||
/// Modbus engineer would expect when reading the spec: exception 02 (Illegal Data
|
||||
/// Address) surfaces as BadOutOfRange so clients can distinguish "tag wrong" from
|
||||
/// generic BadInternalError, exception 04 (Server Failure) as BadDeviceFailure so
|
||||
/// operators see a CPU-mode problem rather than a driver bug, etc. Per
|
||||
/// <c>docs/v2/dl205.md</c>, DL205/DL260 returns only codes 01-04 — no proprietary
|
||||
/// extensions.
|
||||
/// </summary>
|
||||
internal static uint MapModbusExceptionToStatus(byte exceptionCode) => exceptionCode switch
|
||||
{
|
||||
0x01 => StatusBadNotSupported, // Illegal Function — FC not in supported list
|
||||
0x02 => StatusBadOutOfRange, // Illegal Data Address — register outside mapped range
|
||||
0x03 => StatusBadOutOfRange, // Illegal Data Value — quantity over per-FC cap
|
||||
0x04 => StatusBadDeviceFailure, // Server Failure — CPU in PROGRAM mode during protected write
|
||||
0x05 or 0x06 => StatusBadDeviceFailure, // Acknowledge / Server Busy — long-running op / busy
|
||||
0x0A or 0x0B => StatusBadCommunicationError, // Gateway path unavailable / target failed to respond
|
||||
_ => StatusBadInternalError,
|
||||
};
|
||||
|
||||
public void Dispose() => DisposeAsync().AsTask().GetAwaiter().GetResult();
|
||||
public async ValueTask DisposeAsync()
|
||||
|
||||
@@ -25,6 +25,37 @@ public sealed class ModbusDriverOptions
|
||||
/// <see cref="IHostConnectivityProbe"/>.
|
||||
/// </summary>
|
||||
public ModbusProbeOptions Probe { get; init; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// Maximum registers per FC03 (Read Holding Registers) / FC04 (Read Input Registers)
|
||||
/// transaction. Modbus-TCP spec allows 125; many device families impose lower caps:
|
||||
/// AutomationDirect DL205/DL260 cap at <c>128</c>, Mitsubishi Q/FX3U cap at <c>64</c>,
|
||||
/// Omron CJ/CS cap at <c>125</c>. Set to the lowest cap across the devices this driver
|
||||
/// instance talks to; the driver auto-chunks larger reads into consecutive requests.
|
||||
/// Default <c>125</c> — the spec maximum, safe against any conforming server. Setting
|
||||
/// to <c>0</c> disables the cap (discouraged — the spec upper bound still applies).
|
||||
/// </summary>
|
||||
public ushort MaxRegistersPerRead { get; init; } = 125;
|
||||
|
||||
/// <summary>
|
||||
/// Maximum registers per FC16 (Write Multiple Registers) transaction. Spec maximum is
|
||||
/// <c>123</c>; DL205/DL260 cap at <c>100</c>. Matching caller-vs-device semantics:
|
||||
/// exceeding the cap currently throws (writes aren't auto-chunked because a partial
|
||||
/// write across two FC16 calls is no longer atomic — caller must explicitly opt in
|
||||
/// by shortening the tag's <c>StringLength</c> or splitting it into multiple tags).
|
||||
/// </summary>
|
||||
public ushort MaxRegistersPerWrite { get; init; } = 123;
|
||||
|
||||
/// <summary>
|
||||
/// When <c>true</c> (default) the built-in <see cref="ModbusTcpTransport"/> detects
|
||||
/// mid-transaction socket failures (<see cref="System.IO.EndOfStreamException"/>,
|
||||
/// <see cref="System.Net.Sockets.SocketException"/>) and transparently reconnects +
|
||||
/// retries the PDU exactly once. Required for DL205/DL260 because the H2-ECOM100
|
||||
/// does not send TCP keepalives — intermediate NAT / firewall devices silently close
|
||||
/// idle sockets and the first send after the drop would otherwise surface as a
|
||||
/// connection error to the caller even though the PLC is up.
|
||||
/// </summary>
|
||||
public bool AutoReconnect { get; init; } = true;
|
||||
}
|
||||
|
||||
public sealed class ModbusProbeOptions
|
||||
@@ -38,7 +69,9 @@ public sealed class ModbusProbeOptions
|
||||
|
||||
/// <summary>
|
||||
/// One Modbus-backed OPC UA variable. Address is zero-based (Modbus spec numbering, not
|
||||
/// the documentation's 1-based coil/register conventions).
|
||||
/// the documentation's 1-based coil/register conventions). Multi-register types
|
||||
/// (Int32/UInt32/Float32 = 2 regs; Int64/UInt64/Float64 = 4 regs) respect the
|
||||
/// <see cref="ByteOrder"/> field — real-world PLCs disagree on word ordering.
|
||||
/// </summary>
|
||||
/// <param name="Name">
|
||||
/// Tag name, used for both the OPC UA browse name and the driver's full reference. Must be
|
||||
@@ -46,14 +79,92 @@ public sealed class ModbusProbeOptions
|
||||
/// </param>
|
||||
/// <param name="Region">Coils / DiscreteInputs / InputRegisters / HoldingRegisters.</param>
|
||||
/// <param name="Address">Zero-based address within the region.</param>
|
||||
/// <param name="DataType">Logical data type. Int16/UInt16 = single register; Int32/UInt32/Float32 = two registers big-endian.</param>
|
||||
/// <param name="DataType">
|
||||
/// Logical data type. See <see cref="ModbusDataType"/> for the register count each encodes.
|
||||
/// </param>
|
||||
/// <param name="Writable">When true and Region supports writes (Coils / HoldingRegisters), IWritable routes writes here.</param>
|
||||
/// <param name="ByteOrder">Word ordering for multi-register types. Ignored for Bool / Int16 / UInt16 / BitInRegister / String.</param>
|
||||
/// <param name="BitIndex">For <c>DataType = BitInRegister</c>: which bit of the holding register (0-15, LSB-first).</param>
|
||||
/// <param name="StringLength">For <c>DataType = String</c>: number of ASCII characters (2 per register, rounded up).</param>
|
||||
/// <param name="StringByteOrder">
|
||||
/// Per-register byte order for <c>DataType = String</c>. Standard Modbus packs the first
|
||||
/// character in the high byte (<see cref="ModbusStringByteOrder.HighByteFirst"/>).
|
||||
/// AutomationDirect DirectLOGIC (DL205/DL260) and a few legacy families pack the first
|
||||
/// character in the low byte instead — see <c>docs/v2/dl205.md</c> §strings.
|
||||
/// </param>
|
||||
/// <param name="WriteIdempotent">
|
||||
/// Per <c>docs/v2/plan.md</c> decisions #44, #45, #143 — flag a tag as safe to replay on
|
||||
/// write timeout / failure. Default <c>false</c>; writes do not auto-retry. Safe candidates:
|
||||
/// holding-register set-points for analog values and configuration registers where the same
|
||||
/// value can be written again without side-effects. Unsafe: coils that drive edge-triggered
|
||||
/// actions (pulse outputs), counter-increment addresses on PLCs that treat writes as deltas,
|
||||
/// any BCD / counter register where repeat-writes advance state.
|
||||
/// </param>
|
||||
public sealed record ModbusTagDefinition(
|
||||
string Name,
|
||||
ModbusRegion Region,
|
||||
ushort Address,
|
||||
ModbusDataType DataType,
|
||||
bool Writable = true);
|
||||
bool Writable = true,
|
||||
ModbusByteOrder ByteOrder = ModbusByteOrder.BigEndian,
|
||||
byte BitIndex = 0,
|
||||
ushort StringLength = 0,
|
||||
ModbusStringByteOrder StringByteOrder = ModbusStringByteOrder.HighByteFirst,
|
||||
bool WriteIdempotent = false);
|
||||
|
||||
public enum ModbusRegion { Coils, DiscreteInputs, InputRegisters, HoldingRegisters }
|
||||
public enum ModbusDataType { Bool, Int16, UInt16, Int32, UInt32, Float32 }
|
||||
|
||||
public enum ModbusDataType
|
||||
{
|
||||
Bool,
|
||||
Int16,
|
||||
UInt16,
|
||||
Int32,
|
||||
UInt32,
|
||||
Int64,
|
||||
UInt64,
|
||||
Float32,
|
||||
Float64,
|
||||
/// <summary>Single bit within a holding register. <see cref="ModbusTagDefinition.BitIndex"/> selects 0-15 LSB-first.</summary>
|
||||
BitInRegister,
|
||||
/// <summary>ASCII string packed 2 chars per register, <see cref="ModbusTagDefinition.StringLength"/> characters long.</summary>
|
||||
String,
|
||||
/// <summary>
|
||||
/// 16-bit binary-coded decimal. Each nibble encodes one decimal digit (0-9). Register
|
||||
/// value <c>0x1234</c> decodes as decimal <c>1234</c> — NOT binary <c>0x04D2 = 4660</c>.
|
||||
/// DL205/DL260 and several Mitsubishi / Omron families store timers, counters, and
|
||||
/// operator-facing numerics as BCD by default.
|
||||
/// </summary>
|
||||
Bcd16,
|
||||
/// <summary>
|
||||
/// 32-bit (two-register) BCD. Decodes 8 decimal digits. Word ordering follows
|
||||
/// <see cref="ModbusTagDefinition.ByteOrder"/> the same way <see cref="Int32"/> does.
|
||||
/// </summary>
|
||||
Bcd32,
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Word ordering for multi-register types. Modbus TCP standard is <see cref="BigEndian"/>
|
||||
/// (ABCD for 32-bit: high word at the lower address). Many PLCs — Siemens S7, several
|
||||
/// Allen-Bradley series, some Modicon families — use <see cref="WordSwap"/> (CDAB), which
|
||||
/// keeps bytes big-endian within each register but reverses the word pair(s).
|
||||
/// </summary>
|
||||
public enum ModbusByteOrder
|
||||
{
|
||||
BigEndian,
|
||||
WordSwap,
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Per-register byte order for ASCII strings packed 2 chars per register. Standard Modbus
|
||||
/// convention is <see cref="HighByteFirst"/> — the first character of each pair occupies
|
||||
/// the high byte of the register. AutomationDirect DirectLOGIC (DL205, DL260, DL350) and a
|
||||
/// handful of legacy controllers pack <see cref="LowByteFirst"/>, which inverts that within
|
||||
/// each register. Word ordering across multiple registers is always ascending address for
|
||||
/// strings — only the byte order inside each register flips.
|
||||
/// </summary>
|
||||
public enum ModbusStringByteOrder
|
||||
{
|
||||
HighByteFirst,
|
||||
LowByteFirst,
|
||||
}
|
||||
|
||||
@@ -8,33 +8,83 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Modbus;
|
||||
/// support concurrent transactions, but the single-flight model keeps the wire trace
|
||||
/// easy to diagnose and avoids interleaved-response correlation bugs.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Survives mid-transaction socket drops: when a send/read fails with a socket-level
|
||||
/// error (<see cref="IOException"/>, <see cref="SocketException"/>, <see cref="EndOfStreamException"/>)
|
||||
/// the transport disposes the dead socket, reconnects, and retries the PDU exactly
|
||||
/// once. Deliberately limited to a single retry — further failures bubble up so the
|
||||
/// driver's health surface reflects the real state instead of masking a dead PLC.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Why this matters for DL205/DL260: the AutomationDirect H2-ECOM100 does NOT send
|
||||
/// TCP keepalives per <c>docs/v2/dl205.md</c> §behavioral-oddities, so any NAT/firewall
|
||||
/// between the gateway and PLC can silently close an idle socket after 2-5 minutes.
|
||||
/// Also enables OS-level <c>SO_KEEPALIVE</c> so the driver's own side detects a stuck
|
||||
/// socket in reasonable time even when the application is mostly idle.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class ModbusTcpTransport : IModbusTransport
|
||||
{
|
||||
private readonly string _host;
|
||||
private readonly int _port;
|
||||
private readonly TimeSpan _timeout;
|
||||
private readonly bool _autoReconnect;
|
||||
private readonly SemaphoreSlim _gate = new(1, 1);
|
||||
private TcpClient? _client;
|
||||
private NetworkStream? _stream;
|
||||
private ushort _nextTx;
|
||||
private bool _disposed;
|
||||
|
||||
public ModbusTcpTransport(string host, int port, TimeSpan timeout)
|
||||
public ModbusTcpTransport(string host, int port, TimeSpan timeout, bool autoReconnect = true)
|
||||
{
|
||||
_host = host;
|
||||
_port = port;
|
||||
_timeout = timeout;
|
||||
_autoReconnect = autoReconnect;
|
||||
}
|
||||
|
||||
public async Task ConnectAsync(CancellationToken ct)
|
||||
{
|
||||
_client = new TcpClient();
|
||||
// Resolve the host explicitly + prefer IPv4. .NET's TcpClient default-constructor is
|
||||
// dual-stack (IPv6 first, fallback to IPv4) — but most Modbus TCP devices (PLCs and
|
||||
// simulators like pymodbus) bind 0.0.0.0 only, so the IPv6 attempt times out and we
|
||||
// burn the entire ConnectAsync budget before even trying IPv4. Resolving first +
|
||||
// dialing the IPv4 address directly sidesteps that.
|
||||
var addresses = await System.Net.Dns.GetHostAddressesAsync(_host, ct).ConfigureAwait(false);
|
||||
var ipv4 = System.Linq.Enumerable.FirstOrDefault(addresses,
|
||||
a => a.AddressFamily == System.Net.Sockets.AddressFamily.InterNetwork);
|
||||
var target = ipv4 ?? (addresses.Length > 0 ? addresses[0] : System.Net.IPAddress.Loopback);
|
||||
|
||||
_client = new TcpClient(target.AddressFamily);
|
||||
EnableKeepAlive(_client);
|
||||
|
||||
using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
cts.CancelAfter(_timeout);
|
||||
await _client.ConnectAsync(_host, _port, cts.Token).ConfigureAwait(false);
|
||||
await _client.ConnectAsync(target, _port, cts.Token).ConfigureAwait(false);
|
||||
_stream = _client.GetStream();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Enable SO_KEEPALIVE with aggressive probe timing. DL205/DL260 doesn't send keepalives
|
||||
/// itself; having the OS probe the socket every ~30s lets the driver notice a dead PLC
|
||||
/// or broken NAT path long before the default 2-hour Windows idle timeout fires.
|
||||
/// Non-fatal if the underlying OS rejects the option (some older Linux / container
|
||||
/// sandboxes don't expose the fine-grained timing levers — the driver still works,
|
||||
/// application-level probe still detects problems).
|
||||
/// </summary>
|
||||
private static void EnableKeepAlive(TcpClient client)
|
||||
{
|
||||
try
|
||||
{
|
||||
client.Client.SetSocketOption(SocketOptionLevel.Socket, SocketOptionName.KeepAlive, true);
|
||||
client.Client.SetSocketOption(SocketOptionLevel.Tcp, SocketOptionName.TcpKeepAliveTime, 30);
|
||||
client.Client.SetSocketOption(SocketOptionLevel.Tcp, SocketOptionName.TcpKeepAliveInterval, 10);
|
||||
client.Client.SetSocketOption(SocketOptionLevel.Tcp, SocketOptionName.TcpKeepAliveRetryCount, 3);
|
||||
}
|
||||
catch { /* best-effort; older OSes may not expose the granular knobs */ }
|
||||
}
|
||||
|
||||
public async Task<byte[]> SendAsync(byte unitId, byte[] pdu, CancellationToken ct)
|
||||
{
|
||||
if (_disposed) throw new ObjectDisposedException(nameof(ModbusTcpTransport));
|
||||
@@ -43,43 +93,18 @@ public sealed class ModbusTcpTransport : IModbusTransport
|
||||
await _gate.WaitAsync(ct).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
var txId = ++_nextTx;
|
||||
|
||||
// MBAP: [TxId(2)][Proto=0(2)][Length(2)][UnitId(1)] + PDU
|
||||
var adu = new byte[7 + pdu.Length];
|
||||
adu[0] = (byte)(txId >> 8);
|
||||
adu[1] = (byte)(txId & 0xFF);
|
||||
// protocol id already zero
|
||||
var len = (ushort)(1 + pdu.Length); // unit id + pdu
|
||||
adu[4] = (byte)(len >> 8);
|
||||
adu[5] = (byte)(len & 0xFF);
|
||||
adu[6] = unitId;
|
||||
Buffer.BlockCopy(pdu, 0, adu, 7, pdu.Length);
|
||||
|
||||
using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
cts.CancelAfter(_timeout);
|
||||
await _stream.WriteAsync(adu.AsMemory(), cts.Token).ConfigureAwait(false);
|
||||
await _stream.FlushAsync(cts.Token).ConfigureAwait(false);
|
||||
|
||||
var header = new byte[7];
|
||||
await ReadExactlyAsync(_stream, header, cts.Token).ConfigureAwait(false);
|
||||
var respTxId = (ushort)((header[0] << 8) | header[1]);
|
||||
if (respTxId != txId)
|
||||
throw new InvalidDataException($"Modbus TxId mismatch: expected {txId} got {respTxId}");
|
||||
var respLen = (ushort)((header[4] << 8) | header[5]);
|
||||
if (respLen < 1) throw new InvalidDataException($"Modbus response length too small: {respLen}");
|
||||
var respPdu = new byte[respLen - 1];
|
||||
await ReadExactlyAsync(_stream, respPdu, cts.Token).ConfigureAwait(false);
|
||||
|
||||
// Exception PDU: function code has high bit set.
|
||||
if ((respPdu[0] & 0x80) != 0)
|
||||
try
|
||||
{
|
||||
var fc = (byte)(respPdu[0] & 0x7F);
|
||||
var ex = respPdu[1];
|
||||
throw new ModbusException(fc, ex, $"Modbus exception fc={fc} code={ex}");
|
||||
return await SendOnceAsync(unitId, pdu, ct).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex) when (_autoReconnect && IsSocketLevelFailure(ex))
|
||||
{
|
||||
// Mid-transaction drop: tear down the dead socket, reconnect, resend. Single
|
||||
// retry — if it fails again, let it propagate so health/status reflect reality.
|
||||
await TearDownAsync().ConfigureAwait(false);
|
||||
await ConnectAsync(ct).ConfigureAwait(false);
|
||||
return await SendOnceAsync(unitId, pdu, ct).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
return respPdu;
|
||||
}
|
||||
finally
|
||||
{
|
||||
@@ -87,6 +112,68 @@ public sealed class ModbusTcpTransport : IModbusTransport
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<byte[]> SendOnceAsync(byte unitId, byte[] pdu, CancellationToken ct)
|
||||
{
|
||||
if (_stream is null) throw new InvalidOperationException("Transport not connected");
|
||||
var txId = ++_nextTx;
|
||||
|
||||
// MBAP: [TxId(2)][Proto=0(2)][Length(2)][UnitId(1)] + PDU
|
||||
var adu = new byte[7 + pdu.Length];
|
||||
adu[0] = (byte)(txId >> 8);
|
||||
adu[1] = (byte)(txId & 0xFF);
|
||||
// protocol id already zero
|
||||
var len = (ushort)(1 + pdu.Length); // unit id + pdu
|
||||
adu[4] = (byte)(len >> 8);
|
||||
adu[5] = (byte)(len & 0xFF);
|
||||
adu[6] = unitId;
|
||||
Buffer.BlockCopy(pdu, 0, adu, 7, pdu.Length);
|
||||
|
||||
using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
cts.CancelAfter(_timeout);
|
||||
await _stream.WriteAsync(adu.AsMemory(), cts.Token).ConfigureAwait(false);
|
||||
await _stream.FlushAsync(cts.Token).ConfigureAwait(false);
|
||||
|
||||
var header = new byte[7];
|
||||
await ReadExactlyAsync(_stream, header, cts.Token).ConfigureAwait(false);
|
||||
var respTxId = (ushort)((header[0] << 8) | header[1]);
|
||||
if (respTxId != txId)
|
||||
throw new InvalidDataException($"Modbus TxId mismatch: expected {txId} got {respTxId}");
|
||||
var respLen = (ushort)((header[4] << 8) | header[5]);
|
||||
if (respLen < 1) throw new InvalidDataException($"Modbus response length too small: {respLen}");
|
||||
var respPdu = new byte[respLen - 1];
|
||||
await ReadExactlyAsync(_stream, respPdu, cts.Token).ConfigureAwait(false);
|
||||
|
||||
// Exception PDU: function code has high bit set.
|
||||
if ((respPdu[0] & 0x80) != 0)
|
||||
{
|
||||
var fc = (byte)(respPdu[0] & 0x7F);
|
||||
var ex = respPdu[1];
|
||||
throw new ModbusException(fc, ex, $"Modbus exception fc={fc} code={ex}");
|
||||
}
|
||||
|
||||
return respPdu;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Distinguish socket-layer failures (eligible for reconnect-and-retry) from
|
||||
/// protocol-layer failures (must propagate — retrying the same PDU won't help if the
|
||||
/// PLC just returned exception 02 Illegal Data Address).
|
||||
/// </summary>
|
||||
private static bool IsSocketLevelFailure(Exception ex) =>
|
||||
ex is EndOfStreamException
|
||||
|| ex is IOException
|
||||
|| ex is SocketException
|
||||
|| ex is ObjectDisposedException;
|
||||
|
||||
private async Task TearDownAsync()
|
||||
{
|
||||
try { if (_stream is not null) await _stream.DisposeAsync().ConfigureAwait(false); }
|
||||
catch { /* best-effort */ }
|
||||
_stream = null;
|
||||
try { _client?.Dispose(); } catch { }
|
||||
_client = null;
|
||||
}
|
||||
|
||||
private static async Task ReadExactlyAsync(Stream s, byte[] buf, CancellationToken ct)
|
||||
{
|
||||
var read = 0;
|
||||
|
||||
1384
src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriver.cs
Normal file
1384
src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriver.cs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,180 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient;
|
||||
|
||||
/// <summary>
|
||||
/// OPC UA Client (gateway) driver configuration. Bound from <c>DriverConfig</c> JSON at
|
||||
/// driver-host registration time. Models the settings documented in
|
||||
/// <c>docs/v2/driver-specs.md</c> §8.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// This driver connects to a REMOTE OPC UA server and re-exposes its address space
|
||||
/// through the local OtOpcUa server — the opposite direction from the usual "server
|
||||
/// exposes PLC data" flow. Tier A (pure managed, OPC Foundation reference SDK); universal
|
||||
/// protections cover it.
|
||||
/// </remarks>
|
||||
public sealed class OpcUaClientDriverOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// Remote OPC UA endpoint URL, e.g. <c>opc.tcp://plc.internal:4840</c>. Convenience
|
||||
/// shortcut for a single-endpoint deployment — equivalent to setting
|
||||
/// <see cref="EndpointUrls"/> to a list with this one URL. When both are provided,
|
||||
/// the list wins and <see cref="EndpointUrl"/> is ignored.
|
||||
/// </summary>
|
||||
public string EndpointUrl { get; init; } = "opc.tcp://localhost:4840";
|
||||
|
||||
/// <summary>
|
||||
/// Ordered list of candidate endpoint URLs for failover. The driver tries each in
|
||||
/// order at <see cref="OpcUaClientDriver.InitializeAsync"/> and on session drop;
|
||||
/// the first URL that successfully connects wins. Typical use-case: an OPC UA server
|
||||
/// pair running in hot-standby (primary 4840 + backup 4841) where either can serve
|
||||
/// the same address space. Leave unset (or empty) to use <see cref="EndpointUrl"/>
|
||||
/// as a single-URL shortcut.
|
||||
/// </summary>
|
||||
public IReadOnlyList<string> EndpointUrls { get; init; } = [];
|
||||
|
||||
/// <summary>
|
||||
/// Per-endpoint connect-attempt timeout during the failover sweep. Short enough that
|
||||
/// cycling through several dead servers doesn't blow the overall init budget, long
|
||||
/// enough to tolerate a slow TLS handshake on a healthy server. Applied independently
|
||||
/// of <see cref="Timeout"/> which governs steady-state operations.
|
||||
/// </summary>
|
||||
public TimeSpan PerEndpointConnectTimeout { get; init; } = TimeSpan.FromSeconds(3);
|
||||
|
||||
/// <summary>
|
||||
/// Security policy to require when selecting an endpoint. Either a
|
||||
/// <see cref="OpcUaSecurityPolicy"/> enum constant or a free-form string (for
|
||||
/// forward-compatibility with future OPC UA policies not yet in the enum).
|
||||
/// Matched against <c>EndpointDescription.SecurityPolicyUri</c> suffix — the driver
|
||||
/// connects to the first endpoint whose policy name matches AND whose mode matches
|
||||
/// <see cref="SecurityMode"/>. When set to <see cref="OpcUaSecurityPolicy.None"/>
|
||||
/// the driver picks any unsecured endpoint regardless of policy string.
|
||||
/// </summary>
|
||||
public OpcUaSecurityPolicy SecurityPolicy { get; init; } = OpcUaSecurityPolicy.None;
|
||||
|
||||
/// <summary>Security mode.</summary>
|
||||
public OpcUaSecurityMode SecurityMode { get; init; } = OpcUaSecurityMode.None;
|
||||
|
||||
/// <summary>Authentication type.</summary>
|
||||
public OpcUaAuthType AuthType { get; init; } = OpcUaAuthType.Anonymous;
|
||||
|
||||
/// <summary>User name (required only for <see cref="OpcUaAuthType.Username"/>).</summary>
|
||||
public string? Username { get; init; }
|
||||
|
||||
/// <summary>Password (required only for <see cref="OpcUaAuthType.Username"/>).</summary>
|
||||
public string? Password { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Filesystem path to the user-identity certificate (PFX/PEM). Required when
|
||||
/// <see cref="AuthType"/> is <see cref="OpcUaAuthType.Certificate"/>. The driver
|
||||
/// loads the cert + private key, which the remote server validates against its
|
||||
/// <c>TrustedUserCertificates</c> store to authenticate the session's user token.
|
||||
/// Leave unset to use the driver's application-instance certificate as the user
|
||||
/// token (not typical — most deployments have a separate user cert).
|
||||
/// </summary>
|
||||
public string? UserCertificatePath { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Optional password that unlocks <see cref="UserCertificatePath"/> when the PFX is
|
||||
/// protected. PEM files generally have their password on the adjacent key file; this
|
||||
/// knob only applies to password-locked PFX.
|
||||
/// </summary>
|
||||
public string? UserCertificatePassword { get; init; }
|
||||
|
||||
/// <summary>Server-negotiated session timeout. Default 120s per driver-specs.md §8.</summary>
|
||||
public TimeSpan SessionTimeout { get; init; } = TimeSpan.FromSeconds(120);
|
||||
|
||||
/// <summary>Client-side keep-alive interval.</summary>
|
||||
public TimeSpan KeepAliveInterval { get; init; } = TimeSpan.FromSeconds(5);
|
||||
|
||||
/// <summary>Initial reconnect delay after a session drop.</summary>
|
||||
public TimeSpan ReconnectPeriod { get; init; } = TimeSpan.FromSeconds(5);
|
||||
|
||||
/// <summary>
|
||||
/// When <c>true</c>, the driver accepts any self-signed / untrusted server certificate.
|
||||
/// Dev-only — must be <c>false</c> in production so MITM attacks against the opc.tcp
|
||||
/// channel fail closed.
|
||||
/// </summary>
|
||||
public bool AutoAcceptCertificates { get; init; } = false;
|
||||
|
||||
/// <summary>
|
||||
/// Application URI the driver reports during session creation. Must match the
|
||||
/// subject-alt-name on the client certificate if one is used, which is why it's a
|
||||
/// config knob rather than hard-coded.
|
||||
/// </summary>
|
||||
public string ApplicationUri { get; init; } = "urn:localhost:OtOpcUa:GatewayClient";
|
||||
|
||||
/// <summary>
|
||||
/// Friendly name sent to the remote server for diagnostics. Shows up in the remote
|
||||
/// server's session-list so operators can identify which gateway instance is calling.
|
||||
/// </summary>
|
||||
public string SessionName { get; init; } = "OtOpcUa-Gateway";
|
||||
|
||||
/// <summary>Connect + per-operation timeout.</summary>
|
||||
public TimeSpan Timeout { get; init; } = TimeSpan.FromSeconds(10);
|
||||
|
||||
/// <summary>
|
||||
/// Root NodeId to mirror. Default <c>null</c> = <c>ObjectsFolder</c> (i=85). Set to
|
||||
/// a scoped root to restrict the address space the driver exposes locally — useful
|
||||
/// when the remote server has tens of thousands of nodes and only a subset is
|
||||
/// needed downstream.
|
||||
/// </summary>
|
||||
public string? BrowseRoot { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Cap on total nodes discovered during <c>DiscoverAsync</c>. Default 10_000 —
|
||||
/// bounds memory on runaway remote servers without being so low that normal
|
||||
/// deployments hit it. When the cap is reached discovery stops and a warning is
|
||||
/// written to the driver health surface; the partially-discovered tree is still
|
||||
/// projected into the local address space.
|
||||
/// </summary>
|
||||
public int MaxDiscoveredNodes { get; init; } = 10_000;
|
||||
|
||||
/// <summary>
|
||||
/// Max hierarchical depth of the browse. Default 10 — deep enough for realistic
|
||||
/// OPC UA information models, shallow enough that cyclic graphs can't spin the
|
||||
/// browse forever.
|
||||
/// </summary>
|
||||
public int MaxBrowseDepth { get; init; } = 10;
|
||||
}
|
||||
|
||||
/// <summary>OPC UA message security mode.</summary>
|
||||
public enum OpcUaSecurityMode
|
||||
{
|
||||
None,
|
||||
Sign,
|
||||
SignAndEncrypt,
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// OPC UA security policies recognized by the driver. Maps to the standard
|
||||
/// <c>http://opcfoundation.org/UA/SecurityPolicy#</c> URI suffixes the SDK uses for
|
||||
/// endpoint matching.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <see cref="Basic128Rsa15"/> and <see cref="Basic256"/> are <b>deprecated</b> per OPC UA
|
||||
/// spec v1.04 — they remain in the enum only for brownfield interop with older servers.
|
||||
/// Prefer <see cref="Basic256Sha256"/>, <see cref="Aes128_Sha256_RsaOaep"/>, or
|
||||
/// <see cref="Aes256_Sha256_RsaPss"/> for new deployments.
|
||||
/// </remarks>
|
||||
public enum OpcUaSecurityPolicy
|
||||
{
|
||||
/// <summary>No security. Unsigned, unencrypted wire.</summary>
|
||||
None,
|
||||
/// <summary>Deprecated (OPC UA 1.04). Retained for legacy server interop.</summary>
|
||||
Basic128Rsa15,
|
||||
/// <summary>Deprecated (OPC UA 1.04). Retained for legacy server interop.</summary>
|
||||
Basic256,
|
||||
/// <summary>Recommended baseline for current deployments.</summary>
|
||||
Basic256Sha256,
|
||||
/// <summary>Current OPC UA policy; AES-128 + SHA-256 + RSA-OAEP.</summary>
|
||||
Aes128_Sha256_RsaOaep,
|
||||
/// <summary>Current OPC UA policy; AES-256 + SHA-256 + RSA-PSS.</summary>
|
||||
Aes256_Sha256_RsaPss,
|
||||
}
|
||||
|
||||
/// <summary>User authentication type sent to the remote server.</summary>
|
||||
public enum OpcUaAuthType
|
||||
{
|
||||
Anonymous,
|
||||
Username,
|
||||
Certificate,
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<LangVersion>latest</LangVersion>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<NoWarn>$(NoWarn);CS1591</NoWarn>
|
||||
<RootNamespace>ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient</RootNamespace>
|
||||
<AssemblyName>ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient</AssemblyName>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\ZB.MOM.WW.OtOpcUa.Core.Abstractions\ZB.MOM.WW.OtOpcUa.Core.Abstractions.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="OPCFoundation.NetStandard.Opc.Ua.Client" Version="1.5.378.106"/>
|
||||
<PackageReference Include="OPCFoundation.NetStandard.Opc.Ua.Configuration" Version="1.5.378.106"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<InternalsVisibleTo Include="ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests"/>
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
216
src/ZB.MOM.WW.OtOpcUa.Driver.S7/S7AddressParser.cs
Normal file
216
src/ZB.MOM.WW.OtOpcUa.Driver.S7/S7AddressParser.cs
Normal file
@@ -0,0 +1,216 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.S7;
|
||||
|
||||
/// <summary>
|
||||
/// Siemens S7 memory area. The driver's tag-address parser maps every S7 tag string into
|
||||
/// exactly one of these + an offset. Values match the on-wire S7 area codes only
|
||||
/// incidentally — S7.Net uses its own <c>DataType</c> enum (<c>DataBlock</c>, <c>Memory</c>,
|
||||
/// <c>Input</c>, <c>Output</c>, <c>Timer</c>, <c>Counter</c>) so the adapter layer translates.
|
||||
/// </summary>
|
||||
public enum S7Area
|
||||
{
|
||||
DataBlock,
|
||||
Memory, // M (Merker / marker byte)
|
||||
Input, // I (process-image input)
|
||||
Output, // Q (process-image output)
|
||||
Timer,
|
||||
Counter,
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Access width for a DB / M / I / Q address. Timers and counters are always 16-bit
|
||||
/// opaque (not user-addressable via size suffixes).
|
||||
/// </summary>
|
||||
public enum S7Size
|
||||
{
|
||||
Bit, // X
|
||||
Byte, // B
|
||||
Word, // W — 16-bit
|
||||
DWord, // D — 32-bit
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Parsed form of an S7 tag-address string. Produced by <see cref="S7AddressParser.Parse"/>.
|
||||
/// </summary>
|
||||
/// <param name="Area">Memory area (DB, M, I, Q, T, C).</param>
|
||||
/// <param name="DbNumber">Data block number; only meaningful when <paramref name="Area"/> is <see cref="S7Area.DataBlock"/>.</param>
|
||||
/// <param name="Size">Access width. Always <see cref="S7Size.Word"/> for Timer and Counter.</param>
|
||||
/// <param name="ByteOffset">Byte offset into the area (for DB/M/I/Q) or the timer/counter number.</param>
|
||||
/// <param name="BitOffset">Bit position 0-7 when <paramref name="Size"/> is <see cref="S7Size.Bit"/>; 0 otherwise.</param>
|
||||
public readonly record struct S7ParsedAddress(
|
||||
S7Area Area,
|
||||
int DbNumber,
|
||||
S7Size Size,
|
||||
int ByteOffset,
|
||||
int BitOffset);
|
||||
|
||||
/// <summary>
|
||||
/// Parses Siemens S7 address strings into <see cref="S7ParsedAddress"/>. Accepts the
|
||||
/// Siemens TIA-Portal / STEP 7 Classic syntax documented in <c>docs/v2/driver-specs.md</c> §5:
|
||||
/// <list type="bullet">
|
||||
/// <item><c>DB{n}.DB{X|B|W|D}{offset}[.bit]</c> — e.g. <c>DB1.DBX0.0</c>, <c>DB1.DBW0</c>, <c>DB1.DBD4</c></item>
|
||||
/// <item><c>M{B|W|D}{offset}</c> or <c>M{offset}.{bit}</c> — e.g. <c>MB0</c>, <c>MW0</c>, <c>MD4</c>, <c>M0.0</c></item>
|
||||
/// <item><c>I{B|W|D}{offset}</c> or <c>I{offset}.{bit}</c> — e.g. <c>IB0</c>, <c>IW0</c>, <c>ID0</c>, <c>I0.0</c></item>
|
||||
/// <item><c>Q{B|W|D}{offset}</c> or <c>Q{offset}.{bit}</c> — e.g. <c>QB0</c>, <c>QW0</c>, <c>QD0</c>, <c>Q0.0</c></item>
|
||||
/// <item><c>T{n}</c> — e.g. <c>T0</c>, <c>T15</c></item>
|
||||
/// <item><c>C{n}</c> — e.g. <c>C0</c>, <c>C10</c></item>
|
||||
/// </list>
|
||||
/// Grammar is case-insensitive. Leading/trailing whitespace tolerated. Bit specifiers
|
||||
/// must be 0-7; byte offsets must be non-negative; DB numbers must be >= 1.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Parse is deliberately strict — the parser rejects syntactic garbage up-front so a bad
|
||||
/// tag config fails at driver init time instead of surfacing as a misleading
|
||||
/// <c>BadInternalError</c> on every Read against that tag.
|
||||
/// </remarks>
|
||||
public static class S7AddressParser
|
||||
{
|
||||
/// <summary>
|
||||
/// Parse an S7 address. Throws <see cref="FormatException"/> on any syntax error with
|
||||
/// the offending input echoed in the message so operators can correlate to the tag
|
||||
/// config that produced the fault.
|
||||
/// </summary>
|
||||
public static S7ParsedAddress Parse(string address)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(address))
|
||||
throw new FormatException("S7 address must not be empty");
|
||||
var s = address.Trim().ToUpperInvariant();
|
||||
|
||||
// --- DB{n}.DB{X|B|W|D}{offset}[.bit] ---
|
||||
if (s.StartsWith("DB") && TryParseDataBlock(s, out var dbResult))
|
||||
return dbResult;
|
||||
|
||||
if (s.Length < 2)
|
||||
throw new FormatException($"S7 address '{address}' is too short to parse");
|
||||
|
||||
var areaChar = s[0];
|
||||
var rest = s.Substring(1);
|
||||
|
||||
switch (areaChar)
|
||||
{
|
||||
case 'M': return ParseMIQ(S7Area.Memory, rest, address);
|
||||
case 'I': return ParseMIQ(S7Area.Input, rest, address);
|
||||
case 'Q': return ParseMIQ(S7Area.Output, rest, address);
|
||||
case 'T': return ParseTimerOrCounter(S7Area.Timer, rest, address);
|
||||
case 'C': return ParseTimerOrCounter(S7Area.Counter, rest, address);
|
||||
default:
|
||||
throw new FormatException($"S7 address '{address}' starts with unknown area '{areaChar}' (expected DB/M/I/Q/T/C)");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Try-parse variant for callers that can't afford an exception on bad input (e.g.
|
||||
/// config validation pages in the Admin UI). Returns <c>false</c> for any input that
|
||||
/// would throw from <see cref="Parse"/>.
|
||||
/// </summary>
|
||||
public static bool TryParse(string address, out S7ParsedAddress result)
|
||||
{
|
||||
try
|
||||
{
|
||||
result = Parse(address);
|
||||
return true;
|
||||
}
|
||||
catch (FormatException)
|
||||
{
|
||||
result = default;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private static bool TryParseDataBlock(string s, out S7ParsedAddress result)
|
||||
{
|
||||
result = default;
|
||||
// Split on first '.': left side must be DB{n}, right side DB{X|B|W|D}{offset}[.bit]
|
||||
var dot = s.IndexOf('.');
|
||||
if (dot < 0) return false;
|
||||
var head = s.Substring(0, dot); // DB{n}
|
||||
var tail = s.Substring(dot + 1); // DB{X|B|W|D}{offset}[.bit]
|
||||
|
||||
if (head.Length < 3) return false;
|
||||
if (!int.TryParse(head.AsSpan(2), out var dbNumber) || dbNumber < 1)
|
||||
throw new FormatException($"S7 DB number in '{s}' must be a positive integer");
|
||||
|
||||
if (!tail.StartsWith("DB") || tail.Length < 4)
|
||||
throw new FormatException($"S7 DB address tail '{tail}' must start with DB{{X|B|W|D}}");
|
||||
|
||||
var sizeChar = tail[2];
|
||||
var offsetStart = 3;
|
||||
var size = sizeChar switch
|
||||
{
|
||||
'X' => S7Size.Bit,
|
||||
'B' => S7Size.Byte,
|
||||
'W' => S7Size.Word,
|
||||
'D' => S7Size.DWord,
|
||||
_ => throw new FormatException($"S7 DB size '{sizeChar}' in '{s}' must be X/B/W/D"),
|
||||
};
|
||||
|
||||
var (byteOffset, bitOffset) = ParseOffsetAndOptionalBit(tail, offsetStart, size, s);
|
||||
result = new S7ParsedAddress(S7Area.DataBlock, dbNumber, size, byteOffset, bitOffset);
|
||||
return true;
|
||||
}
|
||||
|
||||
private static S7ParsedAddress ParseMIQ(S7Area area, string rest, string original)
|
||||
{
|
||||
if (rest.Length == 0)
|
||||
throw new FormatException($"S7 address '{original}' has no offset");
|
||||
|
||||
var first = rest[0];
|
||||
S7Size size;
|
||||
int offsetStart;
|
||||
switch (first)
|
||||
{
|
||||
case 'B': size = S7Size.Byte; offsetStart = 1; break;
|
||||
case 'W': size = S7Size.Word; offsetStart = 1; break;
|
||||
case 'D': size = S7Size.DWord; offsetStart = 1; break;
|
||||
default:
|
||||
// No size prefix => bit-level address requires explicit .bit. Size stays Bit;
|
||||
// ParseOffsetAndOptionalBit will demand the dot.
|
||||
size = S7Size.Bit;
|
||||
offsetStart = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
var (byteOffset, bitOffset) = ParseOffsetAndOptionalBit(rest, offsetStart, size, original);
|
||||
return new S7ParsedAddress(area, DbNumber: 0, size, byteOffset, bitOffset);
|
||||
}
|
||||
|
||||
private static S7ParsedAddress ParseTimerOrCounter(S7Area area, string rest, string original)
|
||||
{
|
||||
if (rest.Length == 0)
|
||||
throw new FormatException($"S7 address '{original}' has no {area} number");
|
||||
if (!int.TryParse(rest, out var number) || number < 0)
|
||||
throw new FormatException($"S7 {area} number in '{original}' must be a non-negative integer");
|
||||
return new S7ParsedAddress(area, DbNumber: 0, S7Size.Word, number, BitOffset: 0);
|
||||
}
|
||||
|
||||
private static (int byteOffset, int bitOffset) ParseOffsetAndOptionalBit(
|
||||
string s, int start, S7Size size, string original)
|
||||
{
|
||||
var offsetEnd = start;
|
||||
while (offsetEnd < s.Length && s[offsetEnd] >= '0' && s[offsetEnd] <= '9')
|
||||
offsetEnd++;
|
||||
if (offsetEnd == start)
|
||||
throw new FormatException($"S7 address '{original}' has no byte-offset digits");
|
||||
|
||||
if (!int.TryParse(s.AsSpan(start, offsetEnd - start), out var byteOffset) || byteOffset < 0)
|
||||
throw new FormatException($"S7 byte offset in '{original}' must be non-negative");
|
||||
|
||||
// No bit-suffix: done unless size is Bit with no prefix, which requires one.
|
||||
if (offsetEnd == s.Length)
|
||||
{
|
||||
if (size == S7Size.Bit)
|
||||
throw new FormatException($"S7 address '{original}' needs a .{{bit}} suffix for bit access");
|
||||
return (byteOffset, 0);
|
||||
}
|
||||
|
||||
if (s[offsetEnd] != '.')
|
||||
throw new FormatException($"S7 address '{original}' has unexpected character after offset");
|
||||
|
||||
if (size != S7Size.Bit)
|
||||
throw new FormatException($"S7 address '{original}' has a bit suffix but the size is {size} — bit access needs X (DB) or no size prefix (M/I/Q)");
|
||||
|
||||
if (!int.TryParse(s.AsSpan(offsetEnd + 1), out var bitOffset) || bitOffset is < 0 or > 7)
|
||||
throw new FormatException($"S7 bit offset in '{original}' must be 0-7");
|
||||
|
||||
return (byteOffset, bitOffset);
|
||||
}
|
||||
}
|
||||
514
src/ZB.MOM.WW.OtOpcUa.Driver.S7/S7Driver.cs
Normal file
514
src/ZB.MOM.WW.OtOpcUa.Driver.S7/S7Driver.cs
Normal file
@@ -0,0 +1,514 @@
|
||||
using S7.Net;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.S7;
|
||||
|
||||
/// <summary>
|
||||
/// Siemens S7 native driver — speaks S7comm over ISO-on-TCP (port 102) via the S7netplus
|
||||
/// library. First implementation of <see cref="IDriver"/> for an in-process .NET Standard
|
||||
/// PLC protocol that is NOT Modbus, validating that the v2 driver-capability interfaces
|
||||
/// generalize beyond Modbus + Galaxy.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// PR 62 ships the scaffold: <see cref="IDriver"/> only (Initialize / Reinitialize /
|
||||
/// Shutdown / GetHealth). <see cref="ITagDiscovery"/>, <see cref="IReadable"/>,
|
||||
/// <see cref="IWritable"/>, <see cref="ISubscribable"/>, <see cref="IHostConnectivityProbe"/>
|
||||
/// land in PRs 63-65 once the address parser (PR 63) is in place.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Single-connection policy</b>: S7netplus documented pattern is one
|
||||
/// <c>Plc</c> instance per PLC, serialized with a <see cref="SemaphoreSlim"/>.
|
||||
/// Parallelising reads against a single S7 CPU doesn't help — the CPU scans the
|
||||
/// communication mailbox at most once per cycle (2-10 ms) and queues concurrent
|
||||
/// requests wire-side anyway. Multiple client-side connections just waste the CPU's
|
||||
/// 8-64 connection-resource budget.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class S7Driver(S7DriverOptions options, string driverInstanceId)
|
||||
: IDriver, ITagDiscovery, IReadable, IWritable, ISubscribable, IHostConnectivityProbe, IDisposable, IAsyncDisposable
|
||||
{
|
||||
// ---- ISubscribable + IHostConnectivityProbe state ----
|
||||
|
||||
private readonly System.Collections.Concurrent.ConcurrentDictionary<long, SubscriptionState> _subscriptions = new();
|
||||
private long _nextSubscriptionId;
|
||||
private readonly object _probeLock = new();
|
||||
private HostState _hostState = HostState.Unknown;
|
||||
private DateTime _hostStateChangedUtc = DateTime.UtcNow;
|
||||
private CancellationTokenSource? _probeCts;
|
||||
|
||||
public event EventHandler<DataChangeEventArgs>? OnDataChange;
|
||||
public event EventHandler<HostStatusChangedEventArgs>? OnHostStatusChanged;
|
||||
|
||||
/// <summary>OPC UA StatusCode used when the tag name isn't in the driver's tag map.</summary>
|
||||
private const uint StatusBadNodeIdUnknown = 0x80340000u;
|
||||
/// <summary>OPC UA StatusCode used when the tag's data type isn't implemented yet.</summary>
|
||||
private const uint StatusBadNotSupported = 0x803D0000u;
|
||||
/// <summary>OPC UA StatusCode used when the tag is declared read-only.</summary>
|
||||
private const uint StatusBadNotWritable = 0x803B0000u;
|
||||
/// <summary>OPC UA StatusCode used when write fails validation (e.g. out-of-range value).</summary>
|
||||
private const uint StatusBadInternalError = 0x80020000u;
|
||||
/// <summary>OPC UA StatusCode used for socket / timeout / protocol-layer faults.</summary>
|
||||
private const uint StatusBadCommunicationError = 0x80050000u;
|
||||
/// <summary>OPC UA StatusCode used when S7 returns <c>ErrorCode.WrongCPU</c> / PUT/GET disabled.</summary>
|
||||
private const uint StatusBadDeviceFailure = 0x80550000u;
|
||||
|
||||
private readonly Dictionary<string, S7TagDefinition> _tagsByName = new(StringComparer.OrdinalIgnoreCase);
|
||||
private readonly Dictionary<string, S7ParsedAddress> _parsedByName = new(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
private readonly S7DriverOptions _options = options;
|
||||
private readonly SemaphoreSlim _gate = new(1, 1);
|
||||
|
||||
/// <summary>
|
||||
/// Per-connection gate. Internal so PRs 63-65 (read/write/subscribe) can serialize on
|
||||
/// the same semaphore without exposing it publicly. Single-connection-per-PLC is a
|
||||
/// hard requirement of S7netplus — see class remarks.
|
||||
/// </summary>
|
||||
internal SemaphoreSlim Gate => _gate;
|
||||
|
||||
/// <summary>
|
||||
/// Active S7.Net PLC connection. Null until <see cref="InitializeAsync"/> returns; null
|
||||
/// after <see cref="ShutdownAsync"/>. Read-only outside this class; PR 64's Read/Write
|
||||
/// will take the <see cref="_gate"/> before touching it.
|
||||
/// </summary>
|
||||
internal Plc? Plc { get; private set; }
|
||||
|
||||
private DriverHealth _health = new(DriverState.Unknown, null, null);
|
||||
private bool _disposed;
|
||||
|
||||
public string DriverInstanceId => driverInstanceId;
|
||||
public string DriverType => "S7";
|
||||
|
||||
public async Task InitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
|
||||
{
|
||||
_health = new DriverHealth(DriverState.Initializing, null, null);
|
||||
try
|
||||
{
|
||||
var plc = new Plc(_options.CpuType, _options.Host, _options.Rack, _options.Slot);
|
||||
// S7netplus writes timeouts into the underlying TcpClient via Plc.WriteTimeout /
|
||||
// Plc.ReadTimeout (milliseconds). Set before OpenAsync so the handshake itself
|
||||
// honours the bound.
|
||||
plc.WriteTimeout = (int)_options.Timeout.TotalMilliseconds;
|
||||
plc.ReadTimeout = (int)_options.Timeout.TotalMilliseconds;
|
||||
|
||||
using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
|
||||
cts.CancelAfter(_options.Timeout);
|
||||
await plc.OpenAsync(cts.Token).ConfigureAwait(false);
|
||||
|
||||
Plc = plc;
|
||||
|
||||
// Parse every tag's address once at init so config typos fail fast here instead
|
||||
// of surfacing as BadInternalError on every Read against the bad tag. The parser
|
||||
// also rejects bit-offset > 7, DB 0, unknown area letters, etc.
|
||||
_tagsByName.Clear();
|
||||
_parsedByName.Clear();
|
||||
foreach (var t in _options.Tags)
|
||||
{
|
||||
var parsed = S7AddressParser.Parse(t.Address); // throws FormatException
|
||||
_tagsByName[t.Name] = t;
|
||||
_parsedByName[t.Name] = parsed;
|
||||
}
|
||||
|
||||
_health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
|
||||
|
||||
// Kick off the probe loop once the connection is up. Initial HostState stays
|
||||
// Unknown until the first probe tick succeeds — avoids broadcasting a premature
|
||||
// Running transition before any PDU round-trip has happened.
|
||||
if (_options.Probe.Enabled)
|
||||
{
|
||||
_probeCts = new CancellationTokenSource();
|
||||
_ = Task.Run(() => ProbeLoopAsync(_probeCts.Token), _probeCts.Token);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Clean up a partially-constructed Plc so a retry from the caller doesn't leak
|
||||
// the TcpClient. S7netplus's Close() is best-effort and idempotent.
|
||||
try { Plc?.Close(); } catch { }
|
||||
Plc = null;
|
||||
_health = new DriverHealth(DriverState.Faulted, null, ex.Message);
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
public async Task ReinitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
|
||||
{
|
||||
await ShutdownAsync(cancellationToken).ConfigureAwait(false);
|
||||
await InitializeAsync(driverConfigJson, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
public Task ShutdownAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
try { _probeCts?.Cancel(); } catch { }
|
||||
_probeCts?.Dispose();
|
||||
_probeCts = null;
|
||||
|
||||
foreach (var state in _subscriptions.Values)
|
||||
{
|
||||
try { state.Cts.Cancel(); } catch { }
|
||||
state.Cts.Dispose();
|
||||
}
|
||||
_subscriptions.Clear();
|
||||
|
||||
try { Plc?.Close(); } catch { /* best-effort — tearing down anyway */ }
|
||||
Plc = null;
|
||||
_health = new DriverHealth(DriverState.Unknown, _health.LastSuccessfulRead, null);
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
public DriverHealth GetHealth() => _health;
|
||||
|
||||
/// <summary>
|
||||
/// Approximate memory footprint. The Plc instance + one 240-960 byte PDU buffer is
|
||||
/// under 4 KB; return 0 because the <see cref="IDriver"/> contract asks for a
|
||||
/// driver-attributable growth number and S7.Net doesn't expose one.
|
||||
/// </summary>
|
||||
public long GetMemoryFootprint() => 0;
|
||||
|
||||
public Task FlushOptionalCachesAsync(CancellationToken cancellationToken) => Task.CompletedTask;
|
||||
|
||||
// ---- IReadable ----
|
||||
|
||||
public async Task<IReadOnlyList<DataValueSnapshot>> ReadAsync(
|
||||
IReadOnlyList<string> fullReferences, CancellationToken cancellationToken)
|
||||
{
|
||||
var plc = RequirePlc();
|
||||
var now = DateTime.UtcNow;
|
||||
var results = new DataValueSnapshot[fullReferences.Count];
|
||||
|
||||
await _gate.WaitAsync(cancellationToken).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
for (var i = 0; i < fullReferences.Count; i++)
|
||||
{
|
||||
var name = fullReferences[i];
|
||||
if (!_tagsByName.TryGetValue(name, out var tag))
|
||||
{
|
||||
results[i] = new DataValueSnapshot(null, StatusBadNodeIdUnknown, null, now);
|
||||
continue;
|
||||
}
|
||||
try
|
||||
{
|
||||
var value = await ReadOneAsync(plc, tag, cancellationToken).ConfigureAwait(false);
|
||||
results[i] = new DataValueSnapshot(value, 0u, now, now);
|
||||
_health = new DriverHealth(DriverState.Healthy, now, null);
|
||||
}
|
||||
catch (NotSupportedException)
|
||||
{
|
||||
results[i] = new DataValueSnapshot(null, StatusBadNotSupported, null, now);
|
||||
}
|
||||
catch (global::S7.Net.PlcException pex)
|
||||
{
|
||||
// S7.Net's PlcException carries an ErrorCode; PUT/GET-disabled on
|
||||
// S7-1200/1500 surfaces here. Map to BadDeviceFailure so operators see a
|
||||
// device-config problem (toggle PUT/GET in TIA Portal) rather than a
|
||||
// transient fault — per driver-specs.md §5.
|
||||
results[i] = new DataValueSnapshot(null, StatusBadDeviceFailure, null, now);
|
||||
_health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, pex.Message);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
results[i] = new DataValueSnapshot(null, StatusBadCommunicationError, null, now);
|
||||
_health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, ex.Message);
|
||||
}
|
||||
}
|
||||
}
|
||||
finally { _gate.Release(); }
|
||||
return results;
|
||||
}
|
||||
|
||||
private async Task<object> ReadOneAsync(global::S7.Net.Plc plc, S7TagDefinition tag, CancellationToken ct)
|
||||
{
|
||||
var addr = _parsedByName[tag.Name];
|
||||
// S7.Net's string-based ReadAsync returns object where the boxed .NET type depends on
|
||||
// the size suffix: DBX=bool, DBB=byte, DBW=ushort, DBD=uint. Our S7DataType enum
|
||||
// specifies the SEMANTIC type (Int16 vs UInt16 vs Float32 etc.); the reinterpret below
|
||||
// converts the raw unsigned boxed value into the requested type without issuing an
|
||||
// extra PLC round-trip.
|
||||
var raw = await plc.ReadAsync(tag.Address, ct).ConfigureAwait(false)
|
||||
?? throw new System.IO.InvalidDataException($"S7.Net returned null for '{tag.Address}'");
|
||||
|
||||
return (tag.DataType, addr.Size, raw) switch
|
||||
{
|
||||
(S7DataType.Bool, S7Size.Bit, bool b) => b,
|
||||
(S7DataType.Byte, S7Size.Byte, byte by) => by,
|
||||
(S7DataType.UInt16, S7Size.Word, ushort u16) => u16,
|
||||
(S7DataType.Int16, S7Size.Word, ushort u16) => unchecked((short)u16),
|
||||
(S7DataType.UInt32, S7Size.DWord, uint u32) => u32,
|
||||
(S7DataType.Int32, S7Size.DWord, uint u32) => unchecked((int)u32),
|
||||
(S7DataType.Float32, S7Size.DWord, uint u32) => BitConverter.UInt32BitsToSingle(u32),
|
||||
|
||||
(S7DataType.Int64, _, _) => throw new NotSupportedException("S7 Int64 reads land in a follow-up PR"),
|
||||
(S7DataType.UInt64, _, _) => throw new NotSupportedException("S7 UInt64 reads land in a follow-up PR"),
|
||||
(S7DataType.Float64, _, _) => throw new NotSupportedException("S7 Float64 (LReal) reads land in a follow-up PR"),
|
||||
(S7DataType.String, _, _) => throw new NotSupportedException("S7 STRING reads land in a follow-up PR"),
|
||||
(S7DataType.DateTime, _, _) => throw new NotSupportedException("S7 DateTime reads land in a follow-up PR"),
|
||||
|
||||
_ => throw new System.IO.InvalidDataException(
|
||||
$"S7 Read type-mismatch: tag '{tag.Name}' declared {tag.DataType} but address '{tag.Address}' " +
|
||||
$"parsed as Size={addr.Size}; S7.Net returned {raw.GetType().Name}"),
|
||||
};
|
||||
}
|
||||
|
||||
// ---- IWritable ----
|
||||
|
||||
public async Task<IReadOnlyList<WriteResult>> WriteAsync(
|
||||
IReadOnlyList<WriteRequest> writes, CancellationToken cancellationToken)
|
||||
{
|
||||
var plc = RequirePlc();
|
||||
var results = new WriteResult[writes.Count];
|
||||
|
||||
await _gate.WaitAsync(cancellationToken).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
for (var i = 0; i < writes.Count; i++)
|
||||
{
|
||||
var w = writes[i];
|
||||
if (!_tagsByName.TryGetValue(w.FullReference, out var tag))
|
||||
{
|
||||
results[i] = new WriteResult(StatusBadNodeIdUnknown);
|
||||
continue;
|
||||
}
|
||||
if (!tag.Writable)
|
||||
{
|
||||
results[i] = new WriteResult(StatusBadNotWritable);
|
||||
continue;
|
||||
}
|
||||
try
|
||||
{
|
||||
await WriteOneAsync(plc, tag, w.Value, cancellationToken).ConfigureAwait(false);
|
||||
results[i] = new WriteResult(0u);
|
||||
}
|
||||
catch (NotSupportedException)
|
||||
{
|
||||
results[i] = new WriteResult(StatusBadNotSupported);
|
||||
}
|
||||
catch (global::S7.Net.PlcException)
|
||||
{
|
||||
results[i] = new WriteResult(StatusBadDeviceFailure);
|
||||
}
|
||||
catch (Exception)
|
||||
{
|
||||
results[i] = new WriteResult(StatusBadInternalError);
|
||||
}
|
||||
}
|
||||
}
|
||||
finally { _gate.Release(); }
|
||||
return results;
|
||||
}
|
||||
|
||||
private async Task WriteOneAsync(global::S7.Net.Plc plc, S7TagDefinition tag, object? value, CancellationToken ct)
|
||||
{
|
||||
// S7.Net's Plc.WriteAsync(string address, object value) expects the boxed value to
|
||||
// match the address's size-suffix type: DBX=bool, DBB=byte, DBW=ushort, DBD=uint.
|
||||
// Our S7DataType lets the caller pass short/int/float; convert to the unsigned
|
||||
// wire representation before handing off.
|
||||
var boxed = tag.DataType switch
|
||||
{
|
||||
S7DataType.Bool => (object)Convert.ToBoolean(value),
|
||||
S7DataType.Byte => (object)Convert.ToByte(value),
|
||||
S7DataType.UInt16 => (object)Convert.ToUInt16(value),
|
||||
S7DataType.Int16 => (object)unchecked((ushort)Convert.ToInt16(value)),
|
||||
S7DataType.UInt32 => (object)Convert.ToUInt32(value),
|
||||
S7DataType.Int32 => (object)unchecked((uint)Convert.ToInt32(value)),
|
||||
S7DataType.Float32 => (object)BitConverter.SingleToUInt32Bits(Convert.ToSingle(value)),
|
||||
|
||||
S7DataType.Int64 => throw new NotSupportedException("S7 Int64 writes land in a follow-up PR"),
|
||||
S7DataType.UInt64 => throw new NotSupportedException("S7 UInt64 writes land in a follow-up PR"),
|
||||
S7DataType.Float64 => throw new NotSupportedException("S7 Float64 (LReal) writes land in a follow-up PR"),
|
||||
S7DataType.String => throw new NotSupportedException("S7 STRING writes land in a follow-up PR"),
|
||||
S7DataType.DateTime => throw new NotSupportedException("S7 DateTime writes land in a follow-up PR"),
|
||||
_ => throw new InvalidOperationException($"Unknown S7DataType {tag.DataType}"),
|
||||
};
|
||||
await plc.WriteAsync(tag.Address, boxed, ct).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private global::S7.Net.Plc RequirePlc() =>
|
||||
Plc ?? throw new InvalidOperationException("S7Driver not initialized");
|
||||
|
||||
// ---- ITagDiscovery ----
|
||||
|
||||
public Task DiscoverAsync(IAddressSpaceBuilder builder, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(builder);
|
||||
var folder = builder.Folder("S7", "S7");
|
||||
foreach (var t in _options.Tags)
|
||||
{
|
||||
folder.Variable(t.Name, t.Name, new DriverAttributeInfo(
|
||||
FullName: t.Name,
|
||||
DriverDataType: MapDataType(t.DataType),
|
||||
IsArray: false,
|
||||
ArrayDim: null,
|
||||
SecurityClass: t.Writable ? SecurityClassification.Operate : SecurityClassification.ViewOnly,
|
||||
IsHistorized: false,
|
||||
IsAlarm: false,
|
||||
WriteIdempotent: t.WriteIdempotent));
|
||||
}
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
private static DriverDataType MapDataType(S7DataType t) => t switch
|
||||
{
|
||||
S7DataType.Bool => DriverDataType.Boolean,
|
||||
S7DataType.Byte => DriverDataType.Int32, // no 8-bit in DriverDataType yet
|
||||
S7DataType.Int16 or S7DataType.UInt16 or S7DataType.Int32 or S7DataType.UInt32 => DriverDataType.Int32,
|
||||
S7DataType.Int64 or S7DataType.UInt64 => DriverDataType.Int32, // widens; lossy for >2^31-1
|
||||
S7DataType.Float32 => DriverDataType.Float32,
|
||||
S7DataType.Float64 => DriverDataType.Float64,
|
||||
S7DataType.String => DriverDataType.String,
|
||||
S7DataType.DateTime => DriverDataType.DateTime,
|
||||
_ => DriverDataType.Int32,
|
||||
};
|
||||
|
||||
// ---- ISubscribable (polling overlay) ----
|
||||
|
||||
public Task<ISubscriptionHandle> SubscribeAsync(
|
||||
IReadOnlyList<string> fullReferences, TimeSpan publishingInterval, CancellationToken cancellationToken)
|
||||
{
|
||||
var id = Interlocked.Increment(ref _nextSubscriptionId);
|
||||
var cts = new CancellationTokenSource();
|
||||
// Floor at 100 ms — S7 CPUs scan 2-10 ms but the comms mailbox is processed at most
|
||||
// once per scan; sub-100 ms polling just queues wire-side with worse latency.
|
||||
var interval = publishingInterval < TimeSpan.FromMilliseconds(100)
|
||||
? TimeSpan.FromMilliseconds(100)
|
||||
: publishingInterval;
|
||||
var handle = new S7SubscriptionHandle(id);
|
||||
var state = new SubscriptionState(handle, [.. fullReferences], interval, cts);
|
||||
_subscriptions[id] = state;
|
||||
_ = Task.Run(() => PollLoopAsync(state, cts.Token), cts.Token);
|
||||
return Task.FromResult<ISubscriptionHandle>(handle);
|
||||
}
|
||||
|
||||
public Task UnsubscribeAsync(ISubscriptionHandle handle, CancellationToken cancellationToken)
|
||||
{
|
||||
if (handle is S7SubscriptionHandle h && _subscriptions.TryRemove(h.Id, out var state))
|
||||
{
|
||||
state.Cts.Cancel();
|
||||
state.Cts.Dispose();
|
||||
}
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
private async Task PollLoopAsync(SubscriptionState state, CancellationToken ct)
|
||||
{
|
||||
// Initial-data push per OPC UA Part 4 convention.
|
||||
try { await PollOnceAsync(state, forceRaise: true, ct).ConfigureAwait(false); }
|
||||
catch (OperationCanceledException) { return; }
|
||||
catch { /* first-read error — polling continues */ }
|
||||
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
try { await Task.Delay(state.Interval, ct).ConfigureAwait(false); }
|
||||
catch (OperationCanceledException) { return; }
|
||||
|
||||
try { await PollOnceAsync(state, forceRaise: false, ct).ConfigureAwait(false); }
|
||||
catch (OperationCanceledException) { return; }
|
||||
catch { /* transient polling error — loop continues, health surface reflects it */ }
|
||||
}
|
||||
}
|
||||
|
||||
private async Task PollOnceAsync(SubscriptionState state, bool forceRaise, CancellationToken ct)
|
||||
{
|
||||
var snapshots = await ReadAsync(state.TagReferences, ct).ConfigureAwait(false);
|
||||
for (var i = 0; i < state.TagReferences.Count; i++)
|
||||
{
|
||||
var tagRef = state.TagReferences[i];
|
||||
var current = snapshots[i];
|
||||
var lastSeen = state.LastValues.TryGetValue(tagRef, out var prev) ? prev : default;
|
||||
|
||||
if (forceRaise || !Equals(lastSeen?.Value, current.Value) || lastSeen?.StatusCode != current.StatusCode)
|
||||
{
|
||||
state.LastValues[tagRef] = current;
|
||||
OnDataChange?.Invoke(this, new DataChangeEventArgs(state.Handle, tagRef, current));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private sealed record SubscriptionState(
|
||||
S7SubscriptionHandle Handle,
|
||||
IReadOnlyList<string> TagReferences,
|
||||
TimeSpan Interval,
|
||||
CancellationTokenSource Cts)
|
||||
{
|
||||
public System.Collections.Concurrent.ConcurrentDictionary<string, DataValueSnapshot> LastValues { get; }
|
||||
= new(StringComparer.OrdinalIgnoreCase);
|
||||
}
|
||||
|
||||
private sealed record S7SubscriptionHandle(long Id) : ISubscriptionHandle
|
||||
{
|
||||
public string DiagnosticId => $"s7-sub-{Id}";
|
||||
}
|
||||
|
||||
// ---- IHostConnectivityProbe ----
|
||||
|
||||
/// <summary>
|
||||
/// Host identifier surfaced in <see cref="GetHostStatuses"/>. <c>host:port</c> format
|
||||
/// matches the Modbus driver's convention so the Admin UI dashboard renders both
|
||||
/// family's rows uniformly.
|
||||
/// </summary>
|
||||
public string HostName => $"{_options.Host}:{_options.Port}";
|
||||
|
||||
public IReadOnlyList<HostConnectivityStatus> GetHostStatuses()
|
||||
{
|
||||
lock (_probeLock)
|
||||
return [new HostConnectivityStatus(HostName, _hostState, _hostStateChangedUtc)];
|
||||
}
|
||||
|
||||
private async Task ProbeLoopAsync(CancellationToken ct)
|
||||
{
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
var success = false;
|
||||
try
|
||||
{
|
||||
// Probe via S7.Net's low-cost GetCpuStatus — returns the CPU state (Run/Stop)
|
||||
// and is intentionally light on the comms mailbox. Single-word Plc.ReadAsync
|
||||
// would also work but GetCpuStatus doubles as a "PLC actually up" check.
|
||||
using var probeCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
probeCts.CancelAfter(_options.Probe.Timeout);
|
||||
|
||||
var plc = Plc;
|
||||
if (plc is null) throw new InvalidOperationException("Plc dropped during probe");
|
||||
|
||||
await _gate.WaitAsync(probeCts.Token).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
_ = await plc.ReadStatusAsync(probeCts.Token).ConfigureAwait(false);
|
||||
success = true;
|
||||
}
|
||||
finally { _gate.Release(); }
|
||||
}
|
||||
catch (OperationCanceledException) when (ct.IsCancellationRequested) { return; }
|
||||
catch { /* transport/timeout/exception — treated as Stopped below */ }
|
||||
|
||||
TransitionTo(success ? HostState.Running : HostState.Stopped);
|
||||
|
||||
try { await Task.Delay(_options.Probe.Interval, ct).ConfigureAwait(false); }
|
||||
catch (OperationCanceledException) { return; }
|
||||
}
|
||||
}
|
||||
|
||||
private void TransitionTo(HostState newState)
|
||||
{
|
||||
HostState old;
|
||||
lock (_probeLock)
|
||||
{
|
||||
old = _hostState;
|
||||
if (old == newState) return;
|
||||
_hostState = newState;
|
||||
_hostStateChangedUtc = DateTime.UtcNow;
|
||||
}
|
||||
OnHostStatusChanged?.Invoke(this, new HostStatusChangedEventArgs(HostName, old, newState));
|
||||
}
|
||||
|
||||
public void Dispose() => DisposeAsync().AsTask().GetAwaiter().GetResult();
|
||||
|
||||
public async ValueTask DisposeAsync()
|
||||
{
|
||||
if (_disposed) return;
|
||||
_disposed = true;
|
||||
try { await ShutdownAsync(CancellationToken.None).ConfigureAwait(false); }
|
||||
catch { /* disposal is best-effort */ }
|
||||
_gate.Dispose();
|
||||
}
|
||||
}
|
||||
120
src/ZB.MOM.WW.OtOpcUa.Driver.S7/S7DriverOptions.cs
Normal file
120
src/ZB.MOM.WW.OtOpcUa.Driver.S7/S7DriverOptions.cs
Normal file
@@ -0,0 +1,120 @@
|
||||
using S7NetCpuType = global::S7.Net.CpuType;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.S7;
|
||||
|
||||
/// <summary>
|
||||
/// Siemens S7 native (S7comm / ISO-on-TCP port 102) driver configuration. Bound from the
|
||||
/// driver's <c>DriverConfig</c> JSON at <c>DriverHost.RegisterAsync</c>. Unlike the Modbus
|
||||
/// driver the S7 driver uses the PLC's *native* protocol — port 102 ISO-on-TCP rather
|
||||
/// than Modbus's 502, and S7-specific area codes (DB, M, I, Q) rather than holding-
|
||||
/// register / coil tables.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The driver requires <b>PUT/GET communication enabled</b> in the TIA Portal
|
||||
/// hardware config for S7-1200/1500. The factory default disables PUT/GET access,
|
||||
/// so a driver configured against a freshly-flashed CPU will see a hard error
|
||||
/// (S7.Net surfaces it as <c>Plc.ReadAsync</c> returning <c>ErrorCode.Accessing</c>).
|
||||
/// The driver maps that specifically to <c>BadNotSupported</c> and flags it as a
|
||||
/// configuration alert rather than a transient fault — blind Polly retry is wasted
|
||||
/// effort when the PLC will keep refusing every request.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// See <c>docs/v2/driver-specs.md</c> §5 for the full specification.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class S7DriverOptions
|
||||
{
|
||||
/// <summary>PLC IP address or hostname.</summary>
|
||||
public string Host { get; init; } = "127.0.0.1";
|
||||
|
||||
/// <summary>TCP port. ISO-on-TCP is 102 on every S7 model; override only for unusual NAT setups.</summary>
|
||||
public int Port { get; init; } = 102;
|
||||
|
||||
/// <summary>
|
||||
/// CPU family. Determines the ISO-TSAP slot byte that S7.Net uses during connection
|
||||
/// setup — pick the family that matches the target PLC exactly.
|
||||
/// </summary>
|
||||
public S7NetCpuType CpuType { get; init; } = S7NetCpuType.S71500;
|
||||
|
||||
/// <summary>
|
||||
/// Hardware rack number. Almost always 0; relevant only for distributed S7-400 racks
|
||||
/// with multiple CPUs.
|
||||
/// </summary>
|
||||
public short Rack { get; init; } = 0;
|
||||
|
||||
/// <summary>
|
||||
/// CPU slot. Conventions per family: S7-300 = slot 2, S7-400 = slot 2 or 3,
|
||||
/// S7-1200 / S7-1500 = slot 0 (onboard PN). S7.Net uses this to build the remote
|
||||
/// TSAP. Wrong slot → connection refused during handshake.
|
||||
/// </summary>
|
||||
public short Slot { get; init; } = 0;
|
||||
|
||||
/// <summary>Connect + per-operation timeout.</summary>
|
||||
public TimeSpan Timeout { get; init; } = TimeSpan.FromSeconds(5);
|
||||
|
||||
/// <summary>Pre-declared tag map. S7 has a symbol-table protocol but S7.Net does not expose it, so the driver operates off a static tag list configured per-site. Address grammar documented in S7AddressParser (PR 63).</summary>
|
||||
public IReadOnlyList<S7TagDefinition> Tags { get; init; } = [];
|
||||
|
||||
/// <summary>
|
||||
/// Background connectivity-probe settings. When enabled, the driver runs a tick loop
|
||||
/// that issues a cheap read against <see cref="S7ProbeOptions.ProbeAddress"/> every
|
||||
/// <see cref="S7ProbeOptions.Interval"/> and raises <c>OnHostStatusChanged</c> on
|
||||
/// Running ↔ Stopped transitions.
|
||||
/// </summary>
|
||||
public S7ProbeOptions Probe { get; init; } = new();
|
||||
}
|
||||
|
||||
public sealed class S7ProbeOptions
|
||||
{
|
||||
public bool Enabled { get; init; } = true;
|
||||
public TimeSpan Interval { get; init; } = TimeSpan.FromSeconds(5);
|
||||
public TimeSpan Timeout { get; init; } = TimeSpan.FromSeconds(2);
|
||||
|
||||
/// <summary>
|
||||
/// Address to probe for liveness. DB1.DBW0 is the convention if the PLC project
|
||||
/// reserves a small fingerprint DB for health checks (per <c>docs/v2/s7.md</c>);
|
||||
/// if not, pick any valid Merker word like <c>MW0</c>.
|
||||
/// </summary>
|
||||
public string ProbeAddress { get; init; } = "MW0";
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// One S7 variable as exposed by the driver. Addresses use S7.Net syntax — see
|
||||
/// <c>S7AddressParser</c> (PR 63) for the grammar.
|
||||
/// </summary>
|
||||
/// <param name="Name">Tag name; OPC UA browse name + driver full reference.</param>
|
||||
/// <param name="Address">S7 address string, e.g. <c>DB1.DBW0</c>, <c>M0.0</c>, <c>I0.0</c>, <c>QD4</c>. Grammar documented in <c>S7AddressParser</c> (PR 63).</param>
|
||||
/// <param name="DataType">Logical data type — drives the underlying S7.Net read/write width.</param>
|
||||
/// <param name="Writable">When true the driver accepts writes for this tag.</param>
|
||||
/// <param name="StringLength">For <c>DataType = String</c>: S7-string max length. Default 254 (S7 max).</param>
|
||||
/// <param name="WriteIdempotent">
|
||||
/// Per <c>docs/v2/plan.md</c> decisions #44, #45, #143 — flag a tag as safe to replay on
|
||||
/// write timeout / failure. Default <c>false</c>; writes do not auto-retry. Safe candidates
|
||||
/// on S7: DB word/dword set-points holding analog values, configuration DBs where the same
|
||||
/// value can be written again without side-effects. Unsafe: M (merker) bits or Q (output)
|
||||
/// coils that drive edge-triggered routines in the PLC program.
|
||||
/// </param>
|
||||
public sealed record S7TagDefinition(
|
||||
string Name,
|
||||
string Address,
|
||||
S7DataType DataType,
|
||||
bool Writable = true,
|
||||
int StringLength = 254,
|
||||
bool WriteIdempotent = false);
|
||||
|
||||
public enum S7DataType
|
||||
{
|
||||
Bool,
|
||||
Byte,
|
||||
Int16,
|
||||
UInt16,
|
||||
Int32,
|
||||
UInt32,
|
||||
Int64,
|
||||
UInt64,
|
||||
Float32,
|
||||
Float64,
|
||||
String,
|
||||
DateTime,
|
||||
}
|
||||
@@ -0,0 +1,27 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<LangVersion>latest</LangVersion>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<NoWarn>$(NoWarn);CS1591</NoWarn>
|
||||
<RootNamespace>ZB.MOM.WW.OtOpcUa.Driver.S7</RootNamespace>
|
||||
<AssemblyName>ZB.MOM.WW.OtOpcUa.Driver.S7</AssemblyName>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\ZB.MOM.WW.OtOpcUa.Core.Abstractions\ZB.MOM.WW.OtOpcUa.Core.Abstractions.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="S7netplus" Version="0.20.0"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<InternalsVisibleTo Include="ZB.MOM.WW.OtOpcUa.Driver.S7.Tests"/>
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
143
src/ZB.MOM.WW.OtOpcUa.Server/HostStatusPublisher.cs
Normal file
143
src/ZB.MOM.WW.OtOpcUa.Server/HostStatusPublisher.cs
Normal file
@@ -0,0 +1,143 @@
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Hosting;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Server;
|
||||
|
||||
/// <summary>
|
||||
/// Walks every registered driver once per heartbeat interval, asks each
|
||||
/// <see cref="IHostConnectivityProbe"/>-capable driver for its current
|
||||
/// <see cref="HostConnectivityStatus"/> list, and upserts one
|
||||
/// <see cref="DriverHostStatus"/> row per (NodeId, DriverInstanceId, HostName) into the
|
||||
/// central config DB. Powers the Admin UI's per-host drill-down page (LMX follow-up #7).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Polling rather than event-driven: simpler, and matches the cadence the Admin UI
|
||||
/// consumes. An event-subscription optimization (push on <c>OnHostStatusChanged</c> for
|
||||
/// immediate reflection) is a straightforward follow-up but adds lifecycle complexity
|
||||
/// — drivers can be registered after the publisher starts, and subscribing to each
|
||||
/// one's event on register + unsubscribing on unregister requires DriverHost to expose
|
||||
/// lifecycle events it doesn't today.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <see cref="DriverHostStatus.LastSeenUtc"/> advances every heartbeat so the Admin UI
|
||||
/// can flag stale rows from a crashed Server process independent of
|
||||
/// <see cref="DriverHostStatus.State"/> — a Faulted publisher that stops heartbeating
|
||||
/// stays Faulted in the DB but its LastSeenUtc ages out, which is the signal
|
||||
/// operators actually want.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// If the DB is unreachable on a given tick, the publisher logs and moves on — it
|
||||
/// does not retry or buffer. The next heartbeat picks up the current-state snapshot,
|
||||
/// which is more useful than replaying stale transitions after a long outage.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class HostStatusPublisher(
|
||||
DriverHost driverHost,
|
||||
NodeOptions nodeOptions,
|
||||
IServiceScopeFactory scopeFactory,
|
||||
ILogger<HostStatusPublisher> logger) : BackgroundService
|
||||
{
|
||||
internal static readonly TimeSpan HeartbeatInterval = TimeSpan.FromSeconds(10);
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
// Wait a short moment at startup so NodeBootstrap's RegisterAsync calls have had a
|
||||
// chance to land. First tick runs immediately after so a freshly-started Server
|
||||
// surfaces its host topology in the Admin UI without waiting a full interval.
|
||||
try { await Task.Delay(TimeSpan.FromSeconds(2), stoppingToken); }
|
||||
catch (OperationCanceledException) { return; }
|
||||
|
||||
while (!stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
try { await PublishOnceAsync(stoppingToken); }
|
||||
catch (OperationCanceledException) { return; }
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Never take down the Server on a publisher failure. Log and continue —
|
||||
// stale-row detection on the Admin side will surface the outage.
|
||||
logger.LogWarning(ex, "Host-status publisher tick failed — will retry next heartbeat");
|
||||
}
|
||||
|
||||
try { await Task.Delay(HeartbeatInterval, stoppingToken); }
|
||||
catch (OperationCanceledException) { return; }
|
||||
}
|
||||
}
|
||||
|
||||
internal async Task PublishOnceAsync(CancellationToken ct)
|
||||
{
|
||||
var driverIds = driverHost.RegisteredDriverIds;
|
||||
if (driverIds.Count == 0) return;
|
||||
|
||||
var now = DateTime.UtcNow;
|
||||
using var scope = scopeFactory.CreateScope();
|
||||
var db = scope.ServiceProvider.GetRequiredService<OtOpcUaConfigDbContext>();
|
||||
|
||||
foreach (var driverId in driverIds)
|
||||
{
|
||||
var driver = driverHost.GetDriver(driverId);
|
||||
if (driver is not IHostConnectivityProbe probe) continue;
|
||||
|
||||
IReadOnlyList<HostConnectivityStatus> statuses;
|
||||
try { statuses = probe.GetHostStatuses(); }
|
||||
catch (Exception ex)
|
||||
{
|
||||
logger.LogWarning(ex, "Driver {DriverId} GetHostStatuses threw — skipping this tick", driverId);
|
||||
continue;
|
||||
}
|
||||
|
||||
foreach (var status in statuses)
|
||||
{
|
||||
await UpsertAsync(db, driverId, status, now, ct);
|
||||
}
|
||||
}
|
||||
|
||||
await db.SaveChangesAsync(ct);
|
||||
}
|
||||
|
||||
private async Task UpsertAsync(OtOpcUaConfigDbContext db, string driverId,
|
||||
HostConnectivityStatus status, DateTime now, CancellationToken ct)
|
||||
{
|
||||
var mapped = MapState(status.State);
|
||||
var existing = await db.DriverHostStatuses.SingleOrDefaultAsync(r =>
|
||||
r.NodeId == nodeOptions.NodeId
|
||||
&& r.DriverInstanceId == driverId
|
||||
&& r.HostName == status.HostName, ct);
|
||||
|
||||
if (existing is null)
|
||||
{
|
||||
db.DriverHostStatuses.Add(new DriverHostStatus
|
||||
{
|
||||
NodeId = nodeOptions.NodeId,
|
||||
DriverInstanceId = driverId,
|
||||
HostName = status.HostName,
|
||||
State = mapped,
|
||||
StateChangedUtc = status.LastChangedUtc,
|
||||
LastSeenUtc = now,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
existing.LastSeenUtc = now;
|
||||
if (existing.State != mapped)
|
||||
{
|
||||
existing.State = mapped;
|
||||
existing.StateChangedUtc = status.LastChangedUtc;
|
||||
}
|
||||
}
|
||||
|
||||
internal static DriverHostState MapState(HostState state) => state switch
|
||||
{
|
||||
HostState.Running => DriverHostState.Running,
|
||||
HostState.Stopped => DriverHostState.Stopped,
|
||||
HostState.Faulted => DriverHostState.Faulted,
|
||||
_ => DriverHostState.Unknown,
|
||||
};
|
||||
}
|
||||
@@ -3,7 +3,14 @@ using Microsoft.Extensions.Logging;
|
||||
using Opc.Ua;
|
||||
using Opc.Ua.Server;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
using ZB.MOM.WW.OtOpcUa.Server.Security;
|
||||
using DriverWriteRequest = ZB.MOM.WW.OtOpcUa.Core.Abstractions.WriteRequest;
|
||||
// Core.Abstractions defines a type-named HistoryReadResult (driver-side samples + continuation
|
||||
// point) that collides with Opc.Ua.HistoryReadResult (service-layer per-node result). We
|
||||
// assign driver-side results to an explicitly-aliased local and construct only the service
|
||||
// type in the overrides below.
|
||||
using OpcHistoryReadResult = Opc.Ua.HistoryReadResult;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Server.OpcUa;
|
||||
|
||||
@@ -27,26 +34,39 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
|
||||
private readonly IDriver _driver;
|
||||
private readonly IReadable? _readable;
|
||||
private readonly IWritable? _writable;
|
||||
private readonly CapabilityInvoker _invoker;
|
||||
private readonly ILogger<DriverNodeManager> _logger;
|
||||
|
||||
// Per-variable idempotency flag populated during Variable() registration from
|
||||
// DriverAttributeInfo.WriteIdempotent. Drives ExecuteWriteAsync's retry gating in
|
||||
// OnWriteValue; absent entries default to false (decisions #44, #45, #143).
|
||||
private readonly Dictionary<string, bool> _writeIdempotentByFullRef = new(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
/// <summary>The driver whose address space this node manager exposes.</summary>
|
||||
public IDriver Driver => _driver;
|
||||
|
||||
private FolderState? _driverRoot;
|
||||
private readonly Dictionary<string, BaseDataVariableState> _variablesByFullRef = new(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
// PR 26: SecurityClassification per variable, populated during Variable() registration.
|
||||
// OnWriteValue looks up the classification here to gate the write by the session's roles.
|
||||
// Drivers never enforce authz themselves — the classification is discovery-time metadata
|
||||
// only (feedback_acl_at_server_layer.md).
|
||||
private readonly Dictionary<string, SecurityClassification> _securityByFullRef = new(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
// Active building folder — set per Folder() call so Variable() lands under the right parent.
|
||||
// A stack would support nested folders; we use a single current folder because IAddressSpaceBuilder
|
||||
// returns a child builder per Folder call and the caller threads nesting through those references.
|
||||
private FolderState _currentFolder = null!;
|
||||
|
||||
public DriverNodeManager(IServerInternal server, ApplicationConfiguration configuration,
|
||||
IDriver driver, ILogger<DriverNodeManager> logger)
|
||||
IDriver driver, CapabilityInvoker invoker, ILogger<DriverNodeManager> logger)
|
||||
: base(server, configuration, namespaceUris: $"urn:OtOpcUa:{driver.DriverInstanceId}")
|
||||
{
|
||||
_driver = driver;
|
||||
_readable = driver as IReadable;
|
||||
_writable = driver as IWritable;
|
||||
_invoker = invoker;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
@@ -64,7 +84,13 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
|
||||
NodeId = new NodeId(_driver.DriverInstanceId, NamespaceIndex),
|
||||
BrowseName = new QualifiedName(_driver.DriverInstanceId, NamespaceIndex),
|
||||
DisplayName = new LocalizedText(_driver.DriverInstanceId),
|
||||
EventNotifier = EventNotifiers.None,
|
||||
// Driver root is the conventional event notifier for HistoryReadEvents — clients
|
||||
// request alarm history by targeting it and the node manager routes through
|
||||
// IHistoryProvider.ReadEventsAsync. SubscribeToEvents is also set so live-event
|
||||
// subscriptions (Alarm & Conditions) can point here in a future PR; today the
|
||||
// alarm events are emitted by per-variable AlarmConditionState siblings but a
|
||||
// "subscribe to all events from this driver" path would use this notifier.
|
||||
EventNotifier = (byte)(EventNotifiers.SubscribeToEvents | EventNotifiers.HistoryRead),
|
||||
};
|
||||
|
||||
// Link under Objects folder so clients see the driver subtree at browse root.
|
||||
@@ -115,13 +141,22 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
|
||||
DisplayName = new LocalizedText(displayName),
|
||||
DataType = MapDataType(attributeInfo.DriverDataType),
|
||||
ValueRank = attributeInfo.IsArray ? ValueRanks.OneDimension : ValueRanks.Scalar,
|
||||
AccessLevel = AccessLevels.CurrentReadOrWrite,
|
||||
UserAccessLevel = AccessLevels.CurrentReadOrWrite,
|
||||
// Historized attributes get the HistoryRead access bit so the stack dispatches
|
||||
// incoming HistoryRead service calls to this node. Without it the base class
|
||||
// returns BadHistoryOperationUnsupported before our per-kind hook ever runs.
|
||||
// HistoryWrite isn't granted — history rewrite is a separate capability the
|
||||
// driver doesn't support today.
|
||||
AccessLevel = (byte)(AccessLevels.CurrentReadOrWrite
|
||||
| (attributeInfo.IsHistorized ? AccessLevels.HistoryRead : 0)),
|
||||
UserAccessLevel = (byte)(AccessLevels.CurrentReadOrWrite
|
||||
| (attributeInfo.IsHistorized ? AccessLevels.HistoryRead : 0)),
|
||||
Historizing = attributeInfo.IsHistorized,
|
||||
};
|
||||
_currentFolder.AddChild(v);
|
||||
AddPredefinedNode(SystemContext, v);
|
||||
_variablesByFullRef[attributeInfo.FullName] = v;
|
||||
_securityByFullRef[attributeInfo.FullName] = attributeInfo.SecurityClass;
|
||||
_writeIdempotentByFullRef[attributeInfo.FullName] = attributeInfo.WriteIdempotent;
|
||||
|
||||
v.OnReadValue = OnReadValue;
|
||||
v.OnWriteValue = OnWriteValue;
|
||||
@@ -162,7 +197,11 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
|
||||
try
|
||||
{
|
||||
var fullRef = node.NodeId.Identifier as string ?? "";
|
||||
var result = _readable.ReadAsync([fullRef], CancellationToken.None).GetAwaiter().GetResult();
|
||||
var result = _invoker.ExecuteAsync(
|
||||
DriverCapability.Read,
|
||||
_driver.DriverInstanceId,
|
||||
async ct => (IReadOnlyList<DataValueSnapshot>)await _readable.ReadAsync([fullRef], ct).ConfigureAwait(false),
|
||||
CancellationToken.None).AsTask().GetAwaiter().GetResult();
|
||||
if (result.Count == 0)
|
||||
{
|
||||
statusCode = StatusCodes.BadNoData;
|
||||
@@ -337,11 +376,33 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
|
||||
var fullRef = node.NodeId.Identifier as string;
|
||||
if (string.IsNullOrEmpty(fullRef)) return StatusCodes.BadNodeIdUnknown;
|
||||
|
||||
// PR 26: server-layer write authorization. Look up the attribute's classification
|
||||
// (populated during Variable() in Discover) and check the session's roles against the
|
||||
// policy table. Drivers don't participate in this decision — IWritable.WriteAsync
|
||||
// never sees a request we'd have refused here.
|
||||
if (_securityByFullRef.TryGetValue(fullRef!, out var classification))
|
||||
{
|
||||
var roles = context.UserIdentity is IRoleBearer rb ? rb.Roles : [];
|
||||
if (!WriteAuthzPolicy.IsAllowed(classification, roles))
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Write denied for {FullRef}: classification={Classification} userRoles=[{Roles}]",
|
||||
fullRef, classification, string.Join(",", roles));
|
||||
return new ServiceResult(StatusCodes.BadUserAccessDenied);
|
||||
}
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
var results = _writable.WriteAsync(
|
||||
[new DriverWriteRequest(fullRef!, value)],
|
||||
CancellationToken.None).GetAwaiter().GetResult();
|
||||
var isIdempotent = _writeIdempotentByFullRef.GetValueOrDefault(fullRef!, false);
|
||||
var capturedValue = value;
|
||||
var results = _invoker.ExecuteWriteAsync(
|
||||
_driver.DriverInstanceId,
|
||||
isIdempotent,
|
||||
async ct => (IReadOnlyList<WriteResult>)await _writable.WriteAsync(
|
||||
[new DriverWriteRequest(fullRef!, capturedValue)],
|
||||
ct).ConfigureAwait(false),
|
||||
CancellationToken.None).AsTask().GetAwaiter().GetResult();
|
||||
if (results.Count > 0 && results[0].StatusCode != 0)
|
||||
{
|
||||
statusCode = results[0].StatusCode;
|
||||
@@ -360,4 +421,394 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
|
||||
internal int VariableCount => _variablesByFullRef.Count;
|
||||
internal bool TryGetVariable(string fullRef, out BaseDataVariableState? v)
|
||||
=> _variablesByFullRef.TryGetValue(fullRef, out v!);
|
||||
|
||||
// ===================== HistoryRead service handlers (LMX #1, PR 38) =====================
|
||||
//
|
||||
// Wires the driver's IHistoryProvider capability (PR 35 added ReadAtTimeAsync / ReadEventsAsync
|
||||
// alongside the PR 19 ReadRawAsync / ReadProcessedAsync) to the OPC UA HistoryRead service.
|
||||
// CustomNodeManager2 has four protected per-kind hooks; the base dispatches to the right one
|
||||
// based on the concrete HistoryReadDetails subtype. Each hook is sync-returning-void — the
|
||||
// per-driver async calls are bridged via GetAwaiter().GetResult(), matching the pattern
|
||||
// OnReadValue / OnWriteValue already use in this class so HistoryRead doesn't introduce a
|
||||
// different sync-over-async convention.
|
||||
//
|
||||
// Per-node routing: every HistoryReadValueId in nodesToRead has a NodeHandle in
|
||||
// nodesToProcess; the NodeHandle's NodeId.Identifier is the driver-side full reference
|
||||
// (set during Variable() registration) so we can dispatch straight to IHistoryProvider
|
||||
// without a second lookup. Nodes without IHistoryProvider backing (drivers that don't
|
||||
// implement the capability) surface BadHistoryOperationUnsupported per slot and the
|
||||
// rest of the batch continues — same failure-isolation pattern as OnWriteValue.
|
||||
//
|
||||
// Continuation-point handling is pass-through only in this PR: the driver returns null
|
||||
// from its ContinuationPoint field today so the outer result's ContinuationPoint stays
|
||||
// empty. Full Session.SaveHistoryContinuationPoint plumbing is a follow-up when a driver
|
||||
// actually needs paging — the dispatch shape doesn't change, only the result-population.
|
||||
|
||||
private IHistoryProvider? History => _driver as IHistoryProvider;
|
||||
|
||||
protected override void HistoryReadRawModified(
|
||||
ServerSystemContext context, ReadRawModifiedDetails details, TimestampsToReturn timestamps,
|
||||
IList<HistoryReadValueId> nodesToRead, IList<OpcHistoryReadResult> results,
|
||||
IList<ServiceResult> errors, List<NodeHandle> nodesToProcess,
|
||||
IDictionary<NodeId, NodeState> cache)
|
||||
{
|
||||
if (History is null)
|
||||
{
|
||||
MarkAllUnsupported(nodesToProcess, results, errors);
|
||||
return;
|
||||
}
|
||||
|
||||
// IsReadModified=true requests a "modifications" history (who changed the data, when
|
||||
// it was re-written). The driver side has no modifications store — surface that
|
||||
// explicitly rather than silently returning raw data, which would mislead the client.
|
||||
if (details.IsReadModified)
|
||||
{
|
||||
MarkAllUnsupported(nodesToProcess, results, errors, StatusCodes.BadHistoryOperationUnsupported);
|
||||
return;
|
||||
}
|
||||
|
||||
for (var n = 0; n < nodesToProcess.Count; n++)
|
||||
{
|
||||
var handle = nodesToProcess[n];
|
||||
// NodeHandle.Index points back to the slot in the outer results/errors/nodesToRead
|
||||
// arrays. nodesToProcess is the filtered subset (just the nodes this manager
|
||||
// claimed), so writing to results[n] lands in the wrong slot when N > 1 and nodes
|
||||
// are interleaved across multiple node managers.
|
||||
var i = handle.Index;
|
||||
var fullRef = ResolveFullRef(handle);
|
||||
if (fullRef is null)
|
||||
{
|
||||
WriteNodeIdUnknown(results, errors, i);
|
||||
continue;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
var driverResult = _invoker.ExecuteAsync(
|
||||
DriverCapability.HistoryRead,
|
||||
_driver.DriverInstanceId,
|
||||
async ct => await History.ReadRawAsync(
|
||||
fullRef,
|
||||
details.StartTime,
|
||||
details.EndTime,
|
||||
details.NumValuesPerNode,
|
||||
ct).ConfigureAwait(false),
|
||||
CancellationToken.None).AsTask().GetAwaiter().GetResult();
|
||||
|
||||
WriteResult(results, errors, i, StatusCodes.Good,
|
||||
BuildHistoryData(driverResult.Samples), driverResult.ContinuationPoint);
|
||||
}
|
||||
catch (NotSupportedException)
|
||||
{
|
||||
WriteUnsupported(results, errors, i);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "HistoryReadRaw failed for {FullRef}", fullRef);
|
||||
WriteInternalError(results, errors, i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected override void HistoryReadProcessed(
|
||||
ServerSystemContext context, ReadProcessedDetails details, TimestampsToReturn timestamps,
|
||||
IList<HistoryReadValueId> nodesToRead, IList<OpcHistoryReadResult> results,
|
||||
IList<ServiceResult> errors, List<NodeHandle> nodesToProcess,
|
||||
IDictionary<NodeId, NodeState> cache)
|
||||
{
|
||||
if (History is null)
|
||||
{
|
||||
MarkAllUnsupported(nodesToProcess, results, errors);
|
||||
return;
|
||||
}
|
||||
|
||||
// AggregateType is one NodeId shared across every item in the batch — map once.
|
||||
var aggregate = MapAggregate(details.AggregateType?.FirstOrDefault());
|
||||
if (aggregate is null)
|
||||
{
|
||||
MarkAllUnsupported(nodesToProcess, results, errors, StatusCodes.BadAggregateNotSupported);
|
||||
return;
|
||||
}
|
||||
|
||||
var interval = TimeSpan.FromMilliseconds(details.ProcessingInterval);
|
||||
for (var n = 0; n < nodesToProcess.Count; n++)
|
||||
{
|
||||
var handle = nodesToProcess[n];
|
||||
// NodeHandle.Index points back to the slot in the outer results/errors/nodesToRead
|
||||
// arrays. nodesToProcess is the filtered subset (just the nodes this manager
|
||||
// claimed), so writing to results[n] lands in the wrong slot when N > 1 and nodes
|
||||
// are interleaved across multiple node managers.
|
||||
var i = handle.Index;
|
||||
var fullRef = ResolveFullRef(handle);
|
||||
if (fullRef is null)
|
||||
{
|
||||
WriteNodeIdUnknown(results, errors, i);
|
||||
continue;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
var driverResult = _invoker.ExecuteAsync(
|
||||
DriverCapability.HistoryRead,
|
||||
_driver.DriverInstanceId,
|
||||
async ct => await History.ReadProcessedAsync(
|
||||
fullRef,
|
||||
details.StartTime,
|
||||
details.EndTime,
|
||||
interval,
|
||||
aggregate.Value,
|
||||
ct).ConfigureAwait(false),
|
||||
CancellationToken.None).AsTask().GetAwaiter().GetResult();
|
||||
|
||||
WriteResult(results, errors, i, StatusCodes.Good,
|
||||
BuildHistoryData(driverResult.Samples), driverResult.ContinuationPoint);
|
||||
}
|
||||
catch (NotSupportedException)
|
||||
{
|
||||
WriteUnsupported(results, errors, i);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "HistoryReadProcessed failed for {FullRef}", fullRef);
|
||||
WriteInternalError(results, errors, i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected override void HistoryReadAtTime(
|
||||
ServerSystemContext context, ReadAtTimeDetails details, TimestampsToReturn timestamps,
|
||||
IList<HistoryReadValueId> nodesToRead, IList<OpcHistoryReadResult> results,
|
||||
IList<ServiceResult> errors, List<NodeHandle> nodesToProcess,
|
||||
IDictionary<NodeId, NodeState> cache)
|
||||
{
|
||||
if (History is null)
|
||||
{
|
||||
MarkAllUnsupported(nodesToProcess, results, errors);
|
||||
return;
|
||||
}
|
||||
|
||||
var requestedTimes = (IReadOnlyList<DateTime>)(details.ReqTimes?.ToArray() ?? Array.Empty<DateTime>());
|
||||
for (var n = 0; n < nodesToProcess.Count; n++)
|
||||
{
|
||||
var handle = nodesToProcess[n];
|
||||
// NodeHandle.Index points back to the slot in the outer results/errors/nodesToRead
|
||||
// arrays. nodesToProcess is the filtered subset (just the nodes this manager
|
||||
// claimed), so writing to results[n] lands in the wrong slot when N > 1 and nodes
|
||||
// are interleaved across multiple node managers.
|
||||
var i = handle.Index;
|
||||
var fullRef = ResolveFullRef(handle);
|
||||
if (fullRef is null)
|
||||
{
|
||||
WriteNodeIdUnknown(results, errors, i);
|
||||
continue;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
var driverResult = _invoker.ExecuteAsync(
|
||||
DriverCapability.HistoryRead,
|
||||
_driver.DriverInstanceId,
|
||||
async ct => await History.ReadAtTimeAsync(fullRef, requestedTimes, ct).ConfigureAwait(false),
|
||||
CancellationToken.None).AsTask().GetAwaiter().GetResult();
|
||||
|
||||
WriteResult(results, errors, i, StatusCodes.Good,
|
||||
BuildHistoryData(driverResult.Samples), driverResult.ContinuationPoint);
|
||||
}
|
||||
catch (NotSupportedException)
|
||||
{
|
||||
WriteUnsupported(results, errors, i);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "HistoryReadAtTime failed for {FullRef}", fullRef);
|
||||
WriteInternalError(results, errors, i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected override void HistoryReadEvents(
|
||||
ServerSystemContext context, ReadEventDetails details, TimestampsToReturn timestamps,
|
||||
IList<HistoryReadValueId> nodesToRead, IList<OpcHistoryReadResult> results,
|
||||
IList<ServiceResult> errors, List<NodeHandle> nodesToProcess,
|
||||
IDictionary<NodeId, NodeState> cache)
|
||||
{
|
||||
if (History is null)
|
||||
{
|
||||
MarkAllUnsupported(nodesToProcess, results, errors);
|
||||
return;
|
||||
}
|
||||
|
||||
// SourceName filter extraction is deferred — EventFilter SelectClauses + WhereClause
|
||||
// handling is a dedicated concern (proper per-select-clause Variant population + where
|
||||
// filter evaluation). This PR treats the event query as "all events in range for the
|
||||
// node's source" and populates only the standard BaseEventType fields. Richer filter
|
||||
// handling is a follow-up; clients issuing empty/default filters get the right answer
|
||||
// today which covers the common alarm-history browse case.
|
||||
var maxEvents = (int)details.NumValuesPerNode;
|
||||
if (maxEvents <= 0) maxEvents = 1000;
|
||||
|
||||
for (var n = 0; n < nodesToProcess.Count; n++)
|
||||
{
|
||||
var handle = nodesToProcess[n];
|
||||
// NodeHandle.Index points back to the slot in the outer results/errors/nodesToRead
|
||||
// arrays. nodesToProcess is the filtered subset (just the nodes this manager
|
||||
// claimed), so writing to results[n] lands in the wrong slot when N > 1 and nodes
|
||||
// are interleaved across multiple node managers.
|
||||
var i = handle.Index;
|
||||
// Event history queries may target a notifier object (e.g. the driver-root folder)
|
||||
// rather than a specific variable — in that case we pass sourceName=null to mean
|
||||
// "all sources in the driver's namespace" per the IHistoryProvider contract.
|
||||
var fullRef = ResolveFullRef(handle);
|
||||
|
||||
try
|
||||
{
|
||||
var driverResult = _invoker.ExecuteAsync(
|
||||
DriverCapability.HistoryRead,
|
||||
_driver.DriverInstanceId,
|
||||
async ct => await History.ReadEventsAsync(
|
||||
sourceName: fullRef,
|
||||
startUtc: details.StartTime,
|
||||
endUtc: details.EndTime,
|
||||
maxEvents: maxEvents,
|
||||
cancellationToken: ct).ConfigureAwait(false),
|
||||
CancellationToken.None).AsTask().GetAwaiter().GetResult();
|
||||
|
||||
WriteResult(results, errors, i, StatusCodes.Good,
|
||||
BuildHistoryEvent(driverResult.Events), driverResult.ContinuationPoint);
|
||||
}
|
||||
catch (NotSupportedException)
|
||||
{
|
||||
WriteUnsupported(results, errors, i);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "HistoryReadEvents failed for {FullRef}", fullRef);
|
||||
WriteInternalError(results, errors, i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private string? ResolveFullRef(NodeHandle handle) => handle.NodeId?.Identifier as string;
|
||||
|
||||
// Both the results list AND the parallel errors list must be populated — MasterNodeManager
|
||||
// merges them and the merged StatusCode is what the client sees. Leaving errors[i] at its
|
||||
// default (BadHistoryOperationUnsupported) overrides a Good result with Unsupported, which
|
||||
// masks a correctly-constructed HistoryData response. This was the subtle failure mode
|
||||
// that cost most of PR 38's debugging budget.
|
||||
private static void WriteResult(IList<OpcHistoryReadResult> results, IList<ServiceResult> errors,
|
||||
int i, uint statusCode, ExtensionObject historyData, byte[]? continuationPoint)
|
||||
{
|
||||
results[i] = new OpcHistoryReadResult
|
||||
{
|
||||
StatusCode = statusCode,
|
||||
HistoryData = historyData,
|
||||
ContinuationPoint = continuationPoint,
|
||||
};
|
||||
errors[i] = statusCode == StatusCodes.Good
|
||||
? ServiceResult.Good
|
||||
: new ServiceResult(statusCode);
|
||||
}
|
||||
|
||||
private static void WriteUnsupported(IList<OpcHistoryReadResult> results, IList<ServiceResult> errors, int i)
|
||||
{
|
||||
results[i] = new OpcHistoryReadResult { StatusCode = StatusCodes.BadHistoryOperationUnsupported };
|
||||
errors[i] = StatusCodes.BadHistoryOperationUnsupported;
|
||||
}
|
||||
|
||||
private static void WriteInternalError(IList<OpcHistoryReadResult> results, IList<ServiceResult> errors, int i)
|
||||
{
|
||||
results[i] = new OpcHistoryReadResult { StatusCode = StatusCodes.BadInternalError };
|
||||
errors[i] = StatusCodes.BadInternalError;
|
||||
}
|
||||
|
||||
private static void WriteNodeIdUnknown(IList<OpcHistoryReadResult> results, IList<ServiceResult> errors, int i)
|
||||
{
|
||||
WriteNodeIdUnknown(results, errors, i);
|
||||
errors[i] = StatusCodes.BadNodeIdUnknown;
|
||||
}
|
||||
|
||||
private static void MarkAllUnsupported(
|
||||
List<NodeHandle> nodes, IList<OpcHistoryReadResult> results, IList<ServiceResult> errors,
|
||||
uint statusCode = StatusCodes.BadHistoryOperationUnsupported)
|
||||
{
|
||||
foreach (var handle in nodes)
|
||||
{
|
||||
results[handle.Index] = new OpcHistoryReadResult { StatusCode = statusCode };
|
||||
errors[handle.Index] = statusCode == StatusCodes.Good ? ServiceResult.Good : new ServiceResult(statusCode);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Map the OPC UA Part 13 aggregate-function NodeId to the driver's
|
||||
/// <see cref="HistoryAggregateType"/>. Internal so the test suite can pin the mapping
|
||||
/// without exposing public API. Returns null for unsupported aggregates so the service
|
||||
/// handler can surface <c>BadAggregateNotSupported</c> on the whole batch.
|
||||
/// </summary>
|
||||
internal static HistoryAggregateType? MapAggregate(NodeId? aggregateNodeId)
|
||||
{
|
||||
if (aggregateNodeId is null) return null;
|
||||
|
||||
// Every AggregateFunction_* identifier is a numeric uint on the Server (0) namespace.
|
||||
// Comparing NodeIds by value handles all the cross-encoding cases (expanded vs plain).
|
||||
if (aggregateNodeId == ObjectIds.AggregateFunction_Average) return HistoryAggregateType.Average;
|
||||
if (aggregateNodeId == ObjectIds.AggregateFunction_Minimum) return HistoryAggregateType.Minimum;
|
||||
if (aggregateNodeId == ObjectIds.AggregateFunction_Maximum) return HistoryAggregateType.Maximum;
|
||||
if (aggregateNodeId == ObjectIds.AggregateFunction_Total) return HistoryAggregateType.Total;
|
||||
if (aggregateNodeId == ObjectIds.AggregateFunction_Count) return HistoryAggregateType.Count;
|
||||
return null;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Wrap driver samples as <c>HistoryData</c> in an <c>ExtensionObject</c> — the on-wire
|
||||
/// shape the OPC UA HistoryRead service expects for raw / processed / at-time reads.
|
||||
/// </summary>
|
||||
internal static ExtensionObject BuildHistoryData(IReadOnlyList<DataValueSnapshot> samples)
|
||||
{
|
||||
var values = new DataValueCollection(samples.Count);
|
||||
foreach (var s in samples) values.Add(ToDataValue(s));
|
||||
return new ExtensionObject(new HistoryData { DataValues = values });
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Wrap driver events as <c>HistoryEvent</c> in an <c>ExtensionObject</c>. Populates
|
||||
/// the minimum BaseEventType field set (SourceName, Message, Severity, Time,
|
||||
/// ReceiveTime, EventId) so clients that request the default
|
||||
/// <c>SimpleAttributeOperand</c> select-clauses see useful data. Custom EventFilter
|
||||
/// SelectClause evaluation is deferred — when a client sends a specific operand list,
|
||||
/// they currently get the standard fields back and ignore the extras. Documented on the
|
||||
/// public follow-up list.
|
||||
/// </summary>
|
||||
internal static ExtensionObject BuildHistoryEvent(IReadOnlyList<HistoricalEvent> events)
|
||||
{
|
||||
var fieldLists = new HistoryEventFieldListCollection(events.Count);
|
||||
foreach (var e in events)
|
||||
{
|
||||
var fields = new VariantCollection
|
||||
{
|
||||
// Order must match BaseEventType's conventional field ordering so clients that
|
||||
// didn't customize the SelectClauses still see recognizable columns. A future
|
||||
// PR that respects the client's SelectClause list will drive this from the filter.
|
||||
new Variant(e.EventId),
|
||||
new Variant(e.SourceName ?? string.Empty),
|
||||
new Variant(new LocalizedText(e.Message ?? string.Empty)),
|
||||
new Variant(e.Severity),
|
||||
new Variant(e.EventTimeUtc),
|
||||
new Variant(e.ReceivedTimeUtc),
|
||||
};
|
||||
fieldLists.Add(new HistoryEventFieldList { EventFields = fields });
|
||||
}
|
||||
return new ExtensionObject(new HistoryEvent { Events = fieldLists });
|
||||
}
|
||||
|
||||
internal static DataValue ToDataValue(DataValueSnapshot s)
|
||||
{
|
||||
var dv = new DataValue
|
||||
{
|
||||
Value = s.Value,
|
||||
StatusCode = new StatusCode(s.StatusCode),
|
||||
ServerTimestamp = s.ServerTimestampUtc,
|
||||
};
|
||||
if (s.SourceTimestampUtc.HasValue) dv.SourceTimestamp = s.SourceTimestampUtc.Value;
|
||||
return dv;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ using Opc.Ua;
|
||||
using Opc.Ua.Configuration;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Hosting;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.OpcUa;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
using ZB.MOM.WW.OtOpcUa.Server.Security;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Server.OpcUa;
|
||||
@@ -20,6 +21,7 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable
|
||||
private readonly OpcUaServerOptions _options;
|
||||
private readonly DriverHost _driverHost;
|
||||
private readonly IUserAuthenticator _authenticator;
|
||||
private readonly DriverResiliencePipelineBuilder _pipelineBuilder;
|
||||
private readonly ILoggerFactory _loggerFactory;
|
||||
private readonly ILogger<OpcUaApplicationHost> _logger;
|
||||
private ApplicationInstance? _application;
|
||||
@@ -27,11 +29,13 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable
|
||||
private bool _disposed;
|
||||
|
||||
public OpcUaApplicationHost(OpcUaServerOptions options, DriverHost driverHost,
|
||||
IUserAuthenticator authenticator, ILoggerFactory loggerFactory, ILogger<OpcUaApplicationHost> logger)
|
||||
IUserAuthenticator authenticator, ILoggerFactory loggerFactory, ILogger<OpcUaApplicationHost> logger,
|
||||
DriverResiliencePipelineBuilder? pipelineBuilder = null)
|
||||
{
|
||||
_options = options;
|
||||
_driverHost = driverHost;
|
||||
_authenticator = authenticator;
|
||||
_pipelineBuilder = pipelineBuilder ?? new DriverResiliencePipelineBuilder();
|
||||
_loggerFactory = loggerFactory;
|
||||
_logger = logger;
|
||||
}
|
||||
@@ -58,7 +62,7 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable
|
||||
throw new InvalidOperationException(
|
||||
$"OPC UA application certificate could not be validated or created in {_options.PkiStoreRoot}");
|
||||
|
||||
_server = new OtOpcUaServer(_driverHost, _authenticator, _loggerFactory);
|
||||
_server = new OtOpcUaServer(_driverHost, _authenticator, _pipelineBuilder, _loggerFactory);
|
||||
await _application.Start(_server).ConfigureAwait(false);
|
||||
|
||||
_logger.LogInformation("OPC UA server started — endpoint={Endpoint} driverCount={Count}",
|
||||
|
||||
@@ -5,6 +5,7 @@ using Opc.Ua.Server;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Hosting;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.OpcUa;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
using ZB.MOM.WW.OtOpcUa.Server.Security;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Server.OpcUa;
|
||||
@@ -19,13 +20,19 @@ public sealed class OtOpcUaServer : StandardServer
|
||||
{
|
||||
private readonly DriverHost _driverHost;
|
||||
private readonly IUserAuthenticator _authenticator;
|
||||
private readonly DriverResiliencePipelineBuilder _pipelineBuilder;
|
||||
private readonly ILoggerFactory _loggerFactory;
|
||||
private readonly List<DriverNodeManager> _driverNodeManagers = new();
|
||||
|
||||
public OtOpcUaServer(DriverHost driverHost, IUserAuthenticator authenticator, ILoggerFactory loggerFactory)
|
||||
public OtOpcUaServer(
|
||||
DriverHost driverHost,
|
||||
IUserAuthenticator authenticator,
|
||||
DriverResiliencePipelineBuilder pipelineBuilder,
|
||||
ILoggerFactory loggerFactory)
|
||||
{
|
||||
_driverHost = driverHost;
|
||||
_authenticator = authenticator;
|
||||
_pipelineBuilder = pipelineBuilder;
|
||||
_loggerFactory = loggerFactory;
|
||||
}
|
||||
|
||||
@@ -46,7 +53,12 @@ public sealed class OtOpcUaServer : StandardServer
|
||||
if (driver is null) continue;
|
||||
|
||||
var logger = _loggerFactory.CreateLogger<DriverNodeManager>();
|
||||
var manager = new DriverNodeManager(server, configuration, driver, logger);
|
||||
// Per-driver resilience options: default Tier A pending Stream B.1 which wires
|
||||
// per-type tiers into DriverTypeRegistry. Read ResilienceConfig JSON from the
|
||||
// DriverInstance row in a follow-up PR; for now every driver gets Tier A defaults.
|
||||
var options = new DriverResilienceOptions { Tier = DriverTier.A };
|
||||
var invoker = new CapabilityInvoker(_pipelineBuilder, driver.DriverInstanceId, () => options);
|
||||
var manager = new DriverNodeManager(server, configuration, driver, invoker, logger);
|
||||
_driverNodeManagers.Add(manager);
|
||||
}
|
||||
|
||||
@@ -97,7 +109,7 @@ public sealed class OtOpcUaServer : StandardServer
|
||||
/// managers can gate writes by role via <c>session.Identity</c>. Anonymous identity still
|
||||
/// uses the stack's default.
|
||||
/// </summary>
|
||||
private sealed class RoleBasedIdentity : UserIdentity
|
||||
private sealed class RoleBasedIdentity : UserIdentity, IRoleBearer
|
||||
{
|
||||
public IReadOnlyList<string> Roles { get; }
|
||||
public string? Display { get; }
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Serilog;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration.LocalCache;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Hosting;
|
||||
using ZB.MOM.WW.OtOpcUa.Server;
|
||||
@@ -72,5 +74,11 @@ builder.Services.AddSingleton<NodeBootstrap>();
|
||||
builder.Services.AddSingleton<OpcUaApplicationHost>();
|
||||
builder.Services.AddHostedService<OpcUaServerService>();
|
||||
|
||||
// Central-config DB access for the host-status publisher (LMX follow-up #7). Scoped context
|
||||
// so per-heartbeat change-tracking stays isolated; publisher opens one scope per tick.
|
||||
builder.Services.AddDbContext<OtOpcUaConfigDbContext>(opt =>
|
||||
opt.UseSqlServer(options.ConfigDbConnectionString));
|
||||
builder.Services.AddHostedService<HostStatusPublisher>();
|
||||
|
||||
var host = builder.Build();
|
||||
await host.RunAsync();
|
||||
|
||||
13
src/ZB.MOM.WW.OtOpcUa.Server/Security/IRoleBearer.cs
Normal file
13
src/ZB.MOM.WW.OtOpcUa.Server/Security/IRoleBearer.cs
Normal file
@@ -0,0 +1,13 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Server.Security;
|
||||
|
||||
/// <summary>
|
||||
/// Minimal interface a <see cref="Opc.Ua.IUserIdentity"/> implementation can expose so
|
||||
/// <see cref="ZB.MOM.WW.OtOpcUa.Server.OpcUa.DriverNodeManager"/> can read the session's
|
||||
/// resolved roles without a hard dependency on any specific identity subtype. Implemented
|
||||
/// by <c>OtOpcUaServer.RoleBasedIdentity</c>; tests implement it with stub identities to
|
||||
/// drive the authz policy under different role sets.
|
||||
/// </summary>
|
||||
public interface IRoleBearer
|
||||
{
|
||||
IReadOnlyList<string> Roles { get; }
|
||||
}
|
||||
@@ -2,11 +2,37 @@ namespace ZB.MOM.WW.OtOpcUa.Server.Security;
|
||||
|
||||
/// <summary>
|
||||
/// LDAP settings for the OPC UA server's UserName token validator. Bound from
|
||||
/// <c>appsettings.json</c> <c>OpcUaServer:Ldap</c>. Defaults match the GLAuth dev instance
|
||||
/// (localhost:3893, dc=lmxopcua,dc=local). Production deployments set <see cref="UseTls"/>
|
||||
/// true, populate <see cref="ServiceAccountDn"/> for search-then-bind, and maintain
|
||||
/// <see cref="GroupToRole"/> with the real LDAP group names.
|
||||
/// <c>appsettings.json</c> <c>OpcUaServer:Ldap</c>. Defaults target the GLAuth dev instance
|
||||
/// (localhost:3893, <c>dc=lmxopcua,dc=local</c>) for the stock inner-loop setup. Production
|
||||
/// deployments are expected to point at Active Directory; see <see cref="UserNameAttribute"/>
|
||||
/// and the per-field xml-docs for the AD-specific overrides.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para><b>Active Directory cheat-sheet</b>:</para>
|
||||
/// <list type="bullet">
|
||||
/// <item><see cref="Server"/>: one of the domain controllers, or the domain FQDN (will round-robin DCs).</item>
|
||||
/// <item><see cref="Port"/>: <c>389</c> (LDAP) or <c>636</c> (LDAPS); use 636 + <see cref="UseTls"/> in production.</item>
|
||||
/// <item><see cref="UseTls"/>: <c>true</c>. AD increasingly rejects plain-LDAP bind under LDAP-signing enforcement.</item>
|
||||
/// <item><see cref="AllowInsecureLdap"/>: <c>false</c>. Dev escape hatch only.</item>
|
||||
/// <item><see cref="SearchBase"/>: <c>DC=corp,DC=example,DC=com</c> — your domain's base DN.</item>
|
||||
/// <item><see cref="ServiceAccountDn"/>: a dedicated service principal with read access to user + group entries
|
||||
/// (e.g. <c>CN=OpcUaSvc,OU=Service Accounts,DC=corp,DC=example,DC=com</c>). Never a privileged admin.</item>
|
||||
/// <item><see cref="UserNameAttribute"/>: <c>sAMAccountName</c> (classic login name) or <c>userPrincipalName</c>
|
||||
/// (user@domain form). Default is <c>uid</c> which AD does <b>not</b> populate, so this override is required.</item>
|
||||
/// <item><see cref="DisplayNameAttribute"/>: <c>displayName</c> gives the human name; <c>cn</c> works too but is less rich.</item>
|
||||
/// <item><see cref="GroupAttribute"/>: <c>memberOf</c> — matches AD's default. Values are full DNs
|
||||
/// (<c>CN=<Group>,OU=...,DC=...</c>); the authenticator strips the leading <c>CN=</c> RDN value and uses
|
||||
/// that as the lookup key in <see cref="GroupToRole"/>.</item>
|
||||
/// <item><see cref="GroupToRole"/>: maps your AD group common-names to OPC UA roles — e.g.
|
||||
/// <c>{"OPCUA-Operators" : "WriteOperate", "OPCUA-Engineers" : "WriteConfigure"}</c>.</item>
|
||||
/// </list>
|
||||
/// <para>
|
||||
/// Nested groups are <b>not</b> expanded — AD's <c>tokenGroups</c> / <c>LDAP_MATCHING_RULE_IN_CHAIN</c>
|
||||
/// membership-chain filter isn't used. Assign users directly to the role-mapped groups, or pre-flatten
|
||||
/// membership in your directory. If nested expansion becomes a requirement, it's an authenticator
|
||||
/// enhancement (not a config change).
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class LdapOptions
|
||||
{
|
||||
public bool Enabled { get; init; } = false;
|
||||
@@ -23,6 +49,20 @@ public sealed class LdapOptions
|
||||
public string DisplayNameAttribute { get; init; } = "cn";
|
||||
public string GroupAttribute { get; init; } = "memberOf";
|
||||
|
||||
/// <summary>
|
||||
/// LDAP attribute used to match a login name against user entries in the directory.
|
||||
/// Defaults to <c>uid</c> (RFC 2307). Common overrides:
|
||||
/// <list type="bullet">
|
||||
/// <item><c>sAMAccountName</c> — Active Directory, classic NT-style login names (e.g. <c>jdoe</c>).</item>
|
||||
/// <item><c>userPrincipalName</c> — Active Directory, email-style (e.g. <c>jdoe@corp.example.com</c>).</item>
|
||||
/// <item><c>cn</c> — GLAuth + some OpenLDAP deployments where users are keyed by common-name.</item>
|
||||
/// </list>
|
||||
/// Used only when <see cref="ServiceAccountDn"/> is non-empty (search-then-bind path) —
|
||||
/// direct-bind fallback constructs the DN as <c>cn=<name>,<SearchBase></c>
|
||||
/// regardless of this setting and is not a production-grade path against AD.
|
||||
/// </summary>
|
||||
public string UserNameAttribute { get; init; } = "uid";
|
||||
|
||||
/// <summary>
|
||||
/// LDAP group → OPC UA role. Each authenticated user gets every role whose source group
|
||||
/// is in their membership list. Recognized role names (CLAUDE.md): <c>ReadOnly</c> (browse
|
||||
|
||||
@@ -106,7 +106,7 @@ public sealed class LdapUserAuthenticator(LdapOptions options, ILogger<LdapUserA
|
||||
{
|
||||
await Task.Run(() => conn.Bind(options.ServiceAccountDn, options.ServiceAccountPassword), ct);
|
||||
|
||||
var filter = $"(uid={EscapeLdapFilter(username)})";
|
||||
var filter = $"({options.UserNameAttribute}={EscapeLdapFilter(username)})";
|
||||
var results = await Task.Run(() =>
|
||||
conn.Search(options.SearchBase, LdapConnection.ScopeSub, filter, ["dn"], false), ct);
|
||||
|
||||
|
||||
70
src/ZB.MOM.WW.OtOpcUa.Server/Security/WriteAuthzPolicy.cs
Normal file
70
src/ZB.MOM.WW.OtOpcUa.Server/Security/WriteAuthzPolicy.cs
Normal file
@@ -0,0 +1,70 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Server.Security;
|
||||
|
||||
/// <summary>
|
||||
/// Server-layer write-authorization policy. ACL enforcement lives here — drivers report
|
||||
/// <see cref="SecurityClassification"/> as discovery metadata only; the server decides
|
||||
/// whether a given session is allowed to write a given attribute by checking the session's
|
||||
/// roles (resolved at login via <see cref="LdapUserAuthenticator"/>) against the required
|
||||
/// role for the attribute's classification.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Matches the table in <c>docs/Configuration.md</c>:
|
||||
/// <list type="bullet">
|
||||
/// <item><c>FreeAccess</c>: no role required — anonymous sessions can write (matches v1 default).</item>
|
||||
/// <item><c>Operate</c> / <c>SecuredWrite</c>: <c>WriteOperate</c> role required.</item>
|
||||
/// <item><c>Tune</c>: <c>WriteTune</c> role required.</item>
|
||||
/// <item><c>VerifiedWrite</c> / <c>Configure</c>: <c>WriteConfigure</c> role required.</item>
|
||||
/// <item><c>ViewOnly</c>: no role grants write access.</item>
|
||||
/// </list>
|
||||
/// <c>AlarmAck</c> is checked at the alarm-acknowledge path, not here.
|
||||
/// </remarks>
|
||||
public static class WriteAuthzPolicy
|
||||
{
|
||||
public const string RoleWriteOperate = "WriteOperate";
|
||||
public const string RoleWriteTune = "WriteTune";
|
||||
public const string RoleWriteConfigure = "WriteConfigure";
|
||||
|
||||
/// <summary>
|
||||
/// Decide whether a session with <paramref name="userRoles"/> is allowed to write to an
|
||||
/// attribute with the given <paramref name="classification"/>. Returns true for
|
||||
/// <c>FreeAccess</c> regardless of roles (including empty / anonymous sessions) and
|
||||
/// false for <c>ViewOnly</c> regardless of roles. Every other classification requires
|
||||
/// the session to carry the mapped role — case-insensitive match.
|
||||
/// </summary>
|
||||
public static bool IsAllowed(SecurityClassification classification, IReadOnlyCollection<string> userRoles)
|
||||
{
|
||||
if (classification == SecurityClassification.FreeAccess) return true;
|
||||
if (classification == SecurityClassification.ViewOnly) return false;
|
||||
|
||||
var required = RequiredRole(classification);
|
||||
if (required is null) return false;
|
||||
|
||||
foreach (var r in userRoles)
|
||||
{
|
||||
if (string.Equals(r, required, StringComparison.OrdinalIgnoreCase))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Required role for a classification, or null when no role grants access
|
||||
/// (<see cref="SecurityClassification.ViewOnly"/>) or no role is needed
|
||||
/// (<see cref="SecurityClassification.FreeAccess"/> — also returns null; callers use
|
||||
/// <see cref="IsAllowed"/> which handles the special-cases rather than branching on
|
||||
/// null themselves).
|
||||
/// </summary>
|
||||
public static string? RequiredRole(SecurityClassification classification) => classification switch
|
||||
{
|
||||
SecurityClassification.FreeAccess => null, // IsAllowed short-circuits
|
||||
SecurityClassification.Operate => RoleWriteOperate,
|
||||
SecurityClassification.SecuredWrite => RoleWriteOperate,
|
||||
SecurityClassification.Tune => RoleWriteTune,
|
||||
SecurityClassification.VerifiedWrite => RoleWriteConfigure,
|
||||
SecurityClassification.Configure => RoleWriteConfigure,
|
||||
SecurityClassification.ViewOnly => null, // IsAllowed short-circuits
|
||||
_ => null,
|
||||
};
|
||||
}
|
||||
@@ -24,6 +24,7 @@
|
||||
<PackageReference Include="OPCFoundation.NetStandard.Opc.Ua.Server" Version="1.5.374.126"/>
|
||||
<PackageReference Include="OPCFoundation.NetStandard.Opc.Ua.Configuration" Version="1.5.374.126"/>
|
||||
<PackageReference Include="Novell.Directory.Ldap.NETStandard" Version="3.6.0"/>
|
||||
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" Version="10.0.0"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
||||
153
tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/CertTrustServiceTests.cs
Normal file
153
tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/CertTrustServiceTests.cs
Normal file
@@ -0,0 +1,153 @@
|
||||
using System.Security.Cryptography;
|
||||
using System.Security.Cryptography.X509Certificates;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Options;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Admin.Services;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Admin.Tests;
|
||||
|
||||
[Trait("Category", "Unit")]
|
||||
public sealed class CertTrustServiceTests : IDisposable
|
||||
{
|
||||
private readonly string _root;
|
||||
|
||||
public CertTrustServiceTests()
|
||||
{
|
||||
_root = Path.Combine(Path.GetTempPath(), $"otopcua-cert-test-{Guid.NewGuid():N}");
|
||||
Directory.CreateDirectory(Path.Combine(_root, "rejected", "certs"));
|
||||
Directory.CreateDirectory(Path.Combine(_root, "trusted", "certs"));
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
if (Directory.Exists(_root)) Directory.Delete(_root, recursive: true);
|
||||
}
|
||||
|
||||
private CertTrustService Service() => new(
|
||||
Options.Create(new CertTrustOptions { PkiStoreRoot = _root }),
|
||||
NullLogger<CertTrustService>.Instance);
|
||||
|
||||
private X509Certificate2 WriteTestCert(CertStoreKind kind, string subject)
|
||||
{
|
||||
using var rsa = RSA.Create(2048);
|
||||
var req = new CertificateRequest($"CN={subject}", rsa, HashAlgorithmName.SHA256, RSASignaturePadding.Pkcs1);
|
||||
var cert = req.CreateSelfSigned(DateTimeOffset.UtcNow.AddDays(-1), DateTimeOffset.UtcNow.AddYears(1));
|
||||
var dir = Path.Combine(_root, kind == CertStoreKind.Rejected ? "rejected" : "trusted", "certs");
|
||||
var path = Path.Combine(dir, $"{subject} [{cert.Thumbprint}].der");
|
||||
File.WriteAllBytes(path, cert.Export(X509ContentType.Cert));
|
||||
return cert;
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ListRejected_returns_parsed_cert_info_for_each_der_in_rejected_certs_dir()
|
||||
{
|
||||
var c = WriteTestCert(CertStoreKind.Rejected, "test-client-A");
|
||||
|
||||
var rows = Service().ListRejected();
|
||||
|
||||
rows.Count.ShouldBe(1);
|
||||
rows[0].Thumbprint.ShouldBe(c.Thumbprint);
|
||||
rows[0].Subject.ShouldContain("test-client-A");
|
||||
rows[0].Store.ShouldBe(CertStoreKind.Rejected);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ListTrusted_is_separate_from_rejected()
|
||||
{
|
||||
WriteTestCert(CertStoreKind.Rejected, "rej");
|
||||
WriteTestCert(CertStoreKind.Trusted, "trust");
|
||||
|
||||
var svc = Service();
|
||||
svc.ListRejected().Count.ShouldBe(1);
|
||||
svc.ListTrusted().Count.ShouldBe(1);
|
||||
svc.ListRejected()[0].Subject.ShouldContain("rej");
|
||||
svc.ListTrusted()[0].Subject.ShouldContain("trust");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TrustRejected_moves_file_from_rejected_to_trusted()
|
||||
{
|
||||
var c = WriteTestCert(CertStoreKind.Rejected, "promoteme");
|
||||
var svc = Service();
|
||||
|
||||
svc.TrustRejected(c.Thumbprint).ShouldBeTrue();
|
||||
|
||||
svc.ListRejected().ShouldBeEmpty();
|
||||
var trusted = svc.ListTrusted();
|
||||
trusted.Count.ShouldBe(1);
|
||||
trusted[0].Thumbprint.ShouldBe(c.Thumbprint);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TrustRejected_returns_false_when_thumbprint_not_in_rejected()
|
||||
{
|
||||
var svc = Service();
|
||||
svc.TrustRejected("00DEADBEEF00DEADBEEF00DEADBEEF00DEADBEEF").ShouldBeFalse();
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void DeleteRejected_removes_the_file()
|
||||
{
|
||||
var c = WriteTestCert(CertStoreKind.Rejected, "killme");
|
||||
var svc = Service();
|
||||
|
||||
svc.DeleteRejected(c.Thumbprint).ShouldBeTrue();
|
||||
svc.ListRejected().ShouldBeEmpty();
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void UntrustCert_removes_from_trusted_only()
|
||||
{
|
||||
var c = WriteTestCert(CertStoreKind.Trusted, "revoke");
|
||||
var svc = Service();
|
||||
|
||||
svc.UntrustCert(c.Thumbprint).ShouldBeTrue();
|
||||
svc.ListTrusted().ShouldBeEmpty();
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Thumbprint_match_is_case_insensitive()
|
||||
{
|
||||
var c = WriteTestCert(CertStoreKind.Rejected, "case");
|
||||
var svc = Service();
|
||||
|
||||
// X509Certificate2.Thumbprint is upper-case hex; operators pasting from logs often
|
||||
// lowercase it. IsAllowed-style case-insensitive match keeps the UX forgiving.
|
||||
svc.TrustRejected(c.Thumbprint.ToLowerInvariant()).ShouldBeTrue();
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Missing_store_directories_produce_empty_lists_not_exceptions()
|
||||
{
|
||||
// Fresh root with no certs subfolder — service should tolerate a pristine install.
|
||||
var altRoot = Path.Combine(Path.GetTempPath(), $"otopcua-cert-empty-{Guid.NewGuid():N}");
|
||||
try
|
||||
{
|
||||
var svc = new CertTrustService(
|
||||
Options.Create(new CertTrustOptions { PkiStoreRoot = altRoot }),
|
||||
NullLogger<CertTrustService>.Instance);
|
||||
svc.ListRejected().ShouldBeEmpty();
|
||||
svc.ListTrusted().ShouldBeEmpty();
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (Directory.Exists(altRoot)) Directory.Delete(altRoot, recursive: true);
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Malformed_file_is_skipped_not_fatal()
|
||||
{
|
||||
// Drop junk bytes that don't parse as a cert into the rejected/certs directory. The
|
||||
// service must skip it and still return the valid certs — one bad file can't take the
|
||||
// whole management page offline.
|
||||
File.WriteAllText(Path.Combine(_root, "rejected", "certs", "junk.der"), "not a cert");
|
||||
var c = WriteTestCert(CertStoreKind.Rejected, "valid");
|
||||
|
||||
var rows = Service().ListRejected();
|
||||
rows.Count.ShouldBe(1);
|
||||
rows[0].Thumbprint.ShouldBe(c.Thumbprint);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,128 @@
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Configuration.Tests;
|
||||
|
||||
/// <summary>
|
||||
/// End-to-end round-trip through the DB for the <see cref="DriverHostStatus"/> entity
|
||||
/// added in PR 33 — exercises the composite primary key (NodeId, DriverInstanceId,
|
||||
/// HostName), string-backed <c>DriverHostState</c> conversion, and the two indexes the
|
||||
/// Admin UI's drill-down queries will scan (NodeId, LastSeenUtc).
|
||||
/// </summary>
|
||||
[Trait("Category", "SchemaCompliance")]
|
||||
[Collection(nameof(SchemaComplianceCollection))]
|
||||
public sealed class DriverHostStatusTests(SchemaComplianceFixture fixture)
|
||||
{
|
||||
[Fact]
|
||||
public async Task Composite_key_allows_same_host_across_different_nodes_or_drivers()
|
||||
{
|
||||
await using var ctx = NewContext();
|
||||
|
||||
// Same HostName + DriverInstanceId across two different server nodes — classic 2-node
|
||||
// redundancy case. Both rows must be insertable because each server node owns its own
|
||||
// runtime view of the shared host.
|
||||
var now = DateTime.UtcNow;
|
||||
ctx.DriverHostStatuses.Add(new DriverHostStatus
|
||||
{
|
||||
NodeId = "node-a", DriverInstanceId = "galaxy-1", HostName = "GRPlatform",
|
||||
State = DriverHostState.Running,
|
||||
StateChangedUtc = now, LastSeenUtc = now,
|
||||
});
|
||||
ctx.DriverHostStatuses.Add(new DriverHostStatus
|
||||
{
|
||||
NodeId = "node-b", DriverInstanceId = "galaxy-1", HostName = "GRPlatform",
|
||||
State = DriverHostState.Stopped,
|
||||
StateChangedUtc = now, LastSeenUtc = now,
|
||||
Detail = "secondary hasn't taken over yet",
|
||||
});
|
||||
// Same server node + host, different driver instance — second driver doesn't clobber.
|
||||
ctx.DriverHostStatuses.Add(new DriverHostStatus
|
||||
{
|
||||
NodeId = "node-a", DriverInstanceId = "modbus-plc1", HostName = "GRPlatform",
|
||||
State = DriverHostState.Running,
|
||||
StateChangedUtc = now, LastSeenUtc = now,
|
||||
});
|
||||
await ctx.SaveChangesAsync();
|
||||
|
||||
var rows = await ctx.DriverHostStatuses.AsNoTracking()
|
||||
.Where(r => r.HostName == "GRPlatform").ToListAsync();
|
||||
|
||||
rows.Count.ShouldBe(3);
|
||||
rows.ShouldContain(r => r.NodeId == "node-a" && r.DriverInstanceId == "galaxy-1");
|
||||
rows.ShouldContain(r => r.NodeId == "node-b" && r.State == DriverHostState.Stopped && r.Detail == "secondary hasn't taken over yet");
|
||||
rows.ShouldContain(r => r.NodeId == "node-a" && r.DriverInstanceId == "modbus-plc1");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Upsert_pattern_for_same_key_updates_in_place()
|
||||
{
|
||||
// The publisher hosted service (follow-up PR) upserts on every transition +
|
||||
// heartbeat. This test pins the two-step pattern it will use: check-then-add-or-update
|
||||
// keyed on the composite PK. If the composite key ever changes, this test breaks
|
||||
// loudly so the publisher gets a synchronized update.
|
||||
await using var ctx = NewContext();
|
||||
var t0 = DateTime.UtcNow;
|
||||
ctx.DriverHostStatuses.Add(new DriverHostStatus
|
||||
{
|
||||
NodeId = "upsert-node", DriverInstanceId = "upsert-driver", HostName = "upsert-host",
|
||||
State = DriverHostState.Running,
|
||||
StateChangedUtc = t0, LastSeenUtc = t0,
|
||||
});
|
||||
await ctx.SaveChangesAsync();
|
||||
|
||||
var t1 = t0.AddSeconds(30);
|
||||
await using (var ctx2 = NewContext())
|
||||
{
|
||||
var existing = await ctx2.DriverHostStatuses.SingleAsync(r =>
|
||||
r.NodeId == "upsert-node" && r.DriverInstanceId == "upsert-driver" && r.HostName == "upsert-host");
|
||||
existing.State = DriverHostState.Faulted;
|
||||
existing.StateChangedUtc = t1;
|
||||
existing.LastSeenUtc = t1;
|
||||
existing.Detail = "transport reset by peer";
|
||||
await ctx2.SaveChangesAsync();
|
||||
}
|
||||
|
||||
await using var ctx3 = NewContext();
|
||||
var final = await ctx3.DriverHostStatuses.AsNoTracking().SingleAsync(r =>
|
||||
r.NodeId == "upsert-node" && r.HostName == "upsert-host");
|
||||
final.State.ShouldBe(DriverHostState.Faulted);
|
||||
final.Detail.ShouldBe("transport reset by peer");
|
||||
// Only one row — a naive "always insert" would have created a duplicate PK and thrown.
|
||||
(await ctx3.DriverHostStatuses.CountAsync(r => r.NodeId == "upsert-node")).ShouldBe(1);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Enum_persists_as_string_not_int()
|
||||
{
|
||||
// Fluent config sets HasConversion<string>() on State — the DB stores 'Running' /
|
||||
// 'Stopped' / 'Faulted' / 'Unknown' as nvarchar(16). Verify by reading the raw
|
||||
// string back via ADO; if someone drops the conversion the column will contain '1'
|
||||
// / '2' / '3' and this assertion fails. Matters because DBAs inspecting the table
|
||||
// directly should see readable state names, not enum ordinals.
|
||||
await using var ctx = NewContext();
|
||||
ctx.DriverHostStatuses.Add(new DriverHostStatus
|
||||
{
|
||||
NodeId = "enum-node", DriverInstanceId = "enum-driver", HostName = "enum-host",
|
||||
State = DriverHostState.Faulted,
|
||||
StateChangedUtc = DateTime.UtcNow, LastSeenUtc = DateTime.UtcNow,
|
||||
});
|
||||
await ctx.SaveChangesAsync();
|
||||
|
||||
await using var conn = fixture.OpenConnection();
|
||||
using var cmd = conn.CreateCommand();
|
||||
cmd.CommandText = "SELECT [State] FROM DriverHostStatus WHERE NodeId = 'enum-node'";
|
||||
var rawValue = (string?)await cmd.ExecuteScalarAsync();
|
||||
rawValue.ShouldBe("Faulted");
|
||||
}
|
||||
|
||||
private OtOpcUaConfigDbContext NewContext()
|
||||
{
|
||||
var options = new DbContextOptionsBuilder<OtOpcUaConfigDbContext>()
|
||||
.UseSqlServer(fixture.ConnectionString)
|
||||
.Options;
|
||||
return new OtOpcUaConfigDbContext(options);
|
||||
}
|
||||
}
|
||||
@@ -28,6 +28,7 @@ public sealed class SchemaComplianceTests
|
||||
"Namespace", "UnsArea", "UnsLine",
|
||||
"DriverInstance", "Device", "Equipment", "Tag", "PollGroup",
|
||||
"NodeAcl", "ExternalIdReservation",
|
||||
"DriverHostStatus",
|
||||
};
|
||||
|
||||
var actual = QueryStrings(@"
|
||||
|
||||
@@ -7,11 +7,13 @@ public sealed class DriverTypeRegistryTests
|
||||
{
|
||||
private static DriverTypeMetadata SampleMetadata(
|
||||
string typeName = "Modbus",
|
||||
NamespaceKindCompatibility allowed = NamespaceKindCompatibility.Equipment) =>
|
||||
NamespaceKindCompatibility allowed = NamespaceKindCompatibility.Equipment,
|
||||
DriverTier tier = DriverTier.B) =>
|
||||
new(typeName, allowed,
|
||||
DriverConfigJsonSchema: "{\"type\": \"object\"}",
|
||||
DeviceConfigJsonSchema: "{\"type\": \"object\"}",
|
||||
TagConfigJsonSchema: "{\"type\": \"object\"}");
|
||||
TagConfigJsonSchema: "{\"type\": \"object\"}",
|
||||
Tier: tier);
|
||||
|
||||
[Fact]
|
||||
public void Register_ThenGet_RoundTrips()
|
||||
@@ -24,6 +26,20 @@ public sealed class DriverTypeRegistryTests
|
||||
registry.Get("Modbus").ShouldBe(metadata);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
[InlineData(DriverTier.C)]
|
||||
public void Register_Requires_NonNullTier(DriverTier tier)
|
||||
{
|
||||
var registry = new DriverTypeRegistry();
|
||||
var metadata = SampleMetadata(typeName: $"Driver-{tier}", tier: tier);
|
||||
|
||||
registry.Register(metadata);
|
||||
|
||||
registry.Get(metadata.TypeName).Tier.ShouldBe(tier);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Get_IsCaseInsensitive()
|
||||
{
|
||||
|
||||
@@ -0,0 +1,151 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Resilience;
|
||||
|
||||
[Trait("Category", "Unit")]
|
||||
public sealed class CapabilityInvokerTests
|
||||
{
|
||||
private static CapabilityInvoker MakeInvoker(
|
||||
DriverResiliencePipelineBuilder builder,
|
||||
DriverResilienceOptions options) =>
|
||||
new(builder, "drv-test", () => options);
|
||||
|
||||
[Fact]
|
||||
public async Task Read_ReturnsValue_FromCallSite()
|
||||
{
|
||||
var invoker = MakeInvoker(new DriverResiliencePipelineBuilder(), new DriverResilienceOptions { Tier = DriverTier.A });
|
||||
|
||||
var result = await invoker.ExecuteAsync(
|
||||
DriverCapability.Read,
|
||||
"host-1",
|
||||
_ => ValueTask.FromResult(42),
|
||||
CancellationToken.None);
|
||||
|
||||
result.ShouldBe(42);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Read_Retries_OnTransientFailure()
|
||||
{
|
||||
var invoker = MakeInvoker(new DriverResiliencePipelineBuilder(), new DriverResilienceOptions { Tier = DriverTier.A });
|
||||
var attempts = 0;
|
||||
|
||||
var result = await invoker.ExecuteAsync(
|
||||
DriverCapability.Read,
|
||||
"host-1",
|
||||
async _ =>
|
||||
{
|
||||
attempts++;
|
||||
if (attempts < 2) throw new InvalidOperationException("transient");
|
||||
await Task.Yield();
|
||||
return "ok";
|
||||
},
|
||||
CancellationToken.None);
|
||||
|
||||
result.ShouldBe("ok");
|
||||
attempts.ShouldBe(2);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Write_NonIdempotent_DoesNotRetry_EvenWhenPolicyHasRetries()
|
||||
{
|
||||
var options = new DriverResilienceOptions
|
||||
{
|
||||
Tier = DriverTier.A,
|
||||
CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Write] = new(TimeoutSeconds: 2, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
},
|
||||
};
|
||||
var invoker = MakeInvoker(new DriverResiliencePipelineBuilder(), options);
|
||||
var attempts = 0;
|
||||
|
||||
await Should.ThrowAsync<InvalidOperationException>(async () =>
|
||||
await invoker.ExecuteWriteAsync(
|
||||
"host-1",
|
||||
isIdempotent: false,
|
||||
async _ =>
|
||||
{
|
||||
attempts++;
|
||||
await Task.Yield();
|
||||
throw new InvalidOperationException("boom");
|
||||
#pragma warning disable CS0162
|
||||
return 0;
|
||||
#pragma warning restore CS0162
|
||||
},
|
||||
CancellationToken.None));
|
||||
|
||||
attempts.ShouldBe(1, "non-idempotent write must never replay");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Write_Idempotent_Retries_WhenPolicyHasRetries()
|
||||
{
|
||||
var options = new DriverResilienceOptions
|
||||
{
|
||||
Tier = DriverTier.A,
|
||||
CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Write] = new(TimeoutSeconds: 2, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
},
|
||||
};
|
||||
var invoker = MakeInvoker(new DriverResiliencePipelineBuilder(), options);
|
||||
var attempts = 0;
|
||||
|
||||
var result = await invoker.ExecuteWriteAsync(
|
||||
"host-1",
|
||||
isIdempotent: true,
|
||||
async _ =>
|
||||
{
|
||||
attempts++;
|
||||
if (attempts < 2) throw new InvalidOperationException("transient");
|
||||
await Task.Yield();
|
||||
return "ok";
|
||||
},
|
||||
CancellationToken.None);
|
||||
|
||||
result.ShouldBe("ok");
|
||||
attempts.ShouldBe(2);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Write_Default_DoesNotRetry_WhenPolicyHasZeroRetries()
|
||||
{
|
||||
// Tier A Write default is RetryCount=0. Even isIdempotent=true shouldn't retry
|
||||
// because the policy says not to.
|
||||
var invoker = MakeInvoker(new DriverResiliencePipelineBuilder(), new DriverResilienceOptions { Tier = DriverTier.A });
|
||||
var attempts = 0;
|
||||
|
||||
await Should.ThrowAsync<InvalidOperationException>(async () =>
|
||||
await invoker.ExecuteWriteAsync(
|
||||
"host-1",
|
||||
isIdempotent: true,
|
||||
async _ =>
|
||||
{
|
||||
attempts++;
|
||||
await Task.Yield();
|
||||
throw new InvalidOperationException("boom");
|
||||
#pragma warning disable CS0162
|
||||
return 0;
|
||||
#pragma warning restore CS0162
|
||||
},
|
||||
CancellationToken.None));
|
||||
|
||||
attempts.ShouldBe(1, "tier-A default for Write is RetryCount=0");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Execute_HonorsDifferentHosts_Independently()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var invoker = MakeInvoker(builder, new DriverResilienceOptions { Tier = DriverTier.A });
|
||||
|
||||
await invoker.ExecuteAsync(DriverCapability.Read, "host-a", _ => ValueTask.FromResult(1), CancellationToken.None);
|
||||
await invoker.ExecuteAsync(DriverCapability.Read, "host-b", _ => ValueTask.FromResult(2), CancellationToken.None);
|
||||
|
||||
builder.CachedPipelineCount.ShouldBe(2);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,102 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Resilience;
|
||||
|
||||
[Trait("Category", "Unit")]
|
||||
public sealed class DriverResilienceOptionsTests
|
||||
{
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
[InlineData(DriverTier.C)]
|
||||
public void TierDefaults_Cover_EveryCapability(DriverTier tier)
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
|
||||
foreach (var capability in Enum.GetValues<DriverCapability>())
|
||||
defaults.ShouldContainKey(capability);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
[InlineData(DriverTier.C)]
|
||||
public void Write_NeverRetries_ByDefault(DriverTier tier)
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
defaults[DriverCapability.Write].RetryCount.ShouldBe(0);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
[InlineData(DriverTier.C)]
|
||||
public void AlarmAcknowledge_NeverRetries_ByDefault(DriverTier tier)
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
defaults[DriverCapability.AlarmAcknowledge].RetryCount.ShouldBe(0);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A, DriverCapability.Read)]
|
||||
[InlineData(DriverTier.A, DriverCapability.HistoryRead)]
|
||||
[InlineData(DriverTier.B, DriverCapability.Discover)]
|
||||
[InlineData(DriverTier.B, DriverCapability.Probe)]
|
||||
[InlineData(DriverTier.C, DriverCapability.AlarmSubscribe)]
|
||||
public void IdempotentCapabilities_Retry_ByDefault(DriverTier tier, DriverCapability capability)
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
defaults[capability].RetryCount.ShouldBeGreaterThan(0);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TierC_DisablesCircuitBreaker_DeferringToSupervisor()
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(DriverTier.C);
|
||||
|
||||
foreach (var (_, policy) in defaults)
|
||||
policy.BreakerFailureThreshold.ShouldBe(0, "Tier C breaker is handled by the Proxy supervisor (decision #68)");
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
public void TierAAndB_EnableCircuitBreaker(DriverTier tier)
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
|
||||
foreach (var (_, policy) in defaults)
|
||||
policy.BreakerFailureThreshold.ShouldBeGreaterThan(0);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Resolve_Uses_TierDefaults_When_NoOverride()
|
||||
{
|
||||
var options = new DriverResilienceOptions { Tier = DriverTier.A };
|
||||
|
||||
var resolved = options.Resolve(DriverCapability.Read);
|
||||
|
||||
resolved.ShouldBe(DriverResilienceOptions.GetTierDefaults(DriverTier.A)[DriverCapability.Read]);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Resolve_Uses_Override_When_Configured()
|
||||
{
|
||||
var custom = new CapabilityPolicy(TimeoutSeconds: 42, RetryCount: 7, BreakerFailureThreshold: 9);
|
||||
var options = new DriverResilienceOptions
|
||||
{
|
||||
Tier = DriverTier.A,
|
||||
CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Read] = custom,
|
||||
},
|
||||
};
|
||||
|
||||
options.Resolve(DriverCapability.Read).ShouldBe(custom);
|
||||
options.Resolve(DriverCapability.Write).ShouldBe(
|
||||
DriverResilienceOptions.GetTierDefaults(DriverTier.A)[DriverCapability.Write]);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,222 @@
|
||||
using Polly.CircuitBreaker;
|
||||
using Polly.Timeout;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Resilience;
|
||||
|
||||
[Trait("Category", "Unit")]
|
||||
public sealed class DriverResiliencePipelineBuilderTests
|
||||
{
|
||||
private static readonly DriverResilienceOptions TierAOptions = new() { Tier = DriverTier.A };
|
||||
|
||||
[Fact]
|
||||
public async Task Read_Retries_Transient_Failures()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var pipeline = builder.GetOrCreate("drv-test", "host-1", DriverCapability.Read, TierAOptions);
|
||||
var attempts = 0;
|
||||
|
||||
await pipeline.ExecuteAsync(async _ =>
|
||||
{
|
||||
attempts++;
|
||||
if (attempts < 3) throw new InvalidOperationException("transient");
|
||||
await Task.Yield();
|
||||
});
|
||||
|
||||
attempts.ShouldBe(3);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Write_DoesNotRetry_OnFailure()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var pipeline = builder.GetOrCreate("drv-test", "host-1", DriverCapability.Write, TierAOptions);
|
||||
var attempts = 0;
|
||||
|
||||
var ex = await Should.ThrowAsync<InvalidOperationException>(async () =>
|
||||
{
|
||||
await pipeline.ExecuteAsync(async _ =>
|
||||
{
|
||||
attempts++;
|
||||
await Task.Yield();
|
||||
throw new InvalidOperationException("boom");
|
||||
});
|
||||
});
|
||||
|
||||
attempts.ShouldBe(1);
|
||||
ex.Message.ShouldBe("boom");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task AlarmAcknowledge_DoesNotRetry_OnFailure()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var pipeline = builder.GetOrCreate("drv-test", "host-1", DriverCapability.AlarmAcknowledge, TierAOptions);
|
||||
var attempts = 0;
|
||||
|
||||
await Should.ThrowAsync<InvalidOperationException>(async () =>
|
||||
{
|
||||
await pipeline.ExecuteAsync(async _ =>
|
||||
{
|
||||
attempts++;
|
||||
await Task.Yield();
|
||||
throw new InvalidOperationException("boom");
|
||||
});
|
||||
});
|
||||
|
||||
attempts.ShouldBe(1);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Pipeline_IsIsolated_PerHost()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var driverId = "drv-test";
|
||||
|
||||
var hostA = builder.GetOrCreate(driverId, "host-a", DriverCapability.Read, TierAOptions);
|
||||
var hostB = builder.GetOrCreate(driverId, "host-b", DriverCapability.Read, TierAOptions);
|
||||
|
||||
hostA.ShouldNotBeSameAs(hostB);
|
||||
builder.CachedPipelineCount.ShouldBe(2);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Pipeline_IsReused_ForSameTriple()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var driverId = "drv-test";
|
||||
|
||||
var first = builder.GetOrCreate(driverId, "host-a", DriverCapability.Read, TierAOptions);
|
||||
var second = builder.GetOrCreate(driverId, "host-a", DriverCapability.Read, TierAOptions);
|
||||
|
||||
first.ShouldBeSameAs(second);
|
||||
builder.CachedPipelineCount.ShouldBe(1);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Pipeline_IsIsolated_PerCapability()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var driverId = "drv-test";
|
||||
|
||||
var read = builder.GetOrCreate(driverId, "host-a", DriverCapability.Read, TierAOptions);
|
||||
var write = builder.GetOrCreate(driverId, "host-a", DriverCapability.Write, TierAOptions);
|
||||
|
||||
read.ShouldNotBeSameAs(write);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DeadHost_DoesNotOpenBreaker_ForSiblingHost()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var driverId = "drv-test";
|
||||
|
||||
var deadHost = builder.GetOrCreate(driverId, "dead-plc", DriverCapability.Read, TierAOptions);
|
||||
var liveHost = builder.GetOrCreate(driverId, "live-plc", DriverCapability.Read, TierAOptions);
|
||||
|
||||
var threshold = TierAOptions.Resolve(DriverCapability.Read).BreakerFailureThreshold;
|
||||
for (var i = 0; i < threshold + 5; i++)
|
||||
{
|
||||
await Should.ThrowAsync<Exception>(async () =>
|
||||
await deadHost.ExecuteAsync(async _ =>
|
||||
{
|
||||
await Task.Yield();
|
||||
throw new InvalidOperationException("dead plc");
|
||||
}));
|
||||
}
|
||||
|
||||
var liveAttempts = 0;
|
||||
await liveHost.ExecuteAsync(async _ =>
|
||||
{
|
||||
liveAttempts++;
|
||||
await Task.Yield();
|
||||
});
|
||||
|
||||
liveAttempts.ShouldBe(1, "healthy sibling host must not be affected by dead peer");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task CircuitBreaker_Opens_AfterFailureThreshold_OnTierA()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var pipeline = builder.GetOrCreate("drv-test", "host-1", DriverCapability.Write, TierAOptions);
|
||||
|
||||
var threshold = TierAOptions.Resolve(DriverCapability.Write).BreakerFailureThreshold;
|
||||
for (var i = 0; i < threshold; i++)
|
||||
{
|
||||
await Should.ThrowAsync<InvalidOperationException>(async () =>
|
||||
await pipeline.ExecuteAsync(async _ =>
|
||||
{
|
||||
await Task.Yield();
|
||||
throw new InvalidOperationException("boom");
|
||||
}));
|
||||
}
|
||||
|
||||
await Should.ThrowAsync<BrokenCircuitException>(async () =>
|
||||
await pipeline.ExecuteAsync(async _ =>
|
||||
{
|
||||
await Task.Yield();
|
||||
}));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Timeout_Cancels_SlowOperation()
|
||||
{
|
||||
var tierAWithShortTimeout = new DriverResilienceOptions
|
||||
{
|
||||
Tier = DriverTier.A,
|
||||
CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Read] = new(TimeoutSeconds: 1, RetryCount: 0, BreakerFailureThreshold: 5),
|
||||
},
|
||||
};
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var pipeline = builder.GetOrCreate("drv-test", "host-1", DriverCapability.Read, tierAWithShortTimeout);
|
||||
|
||||
await Should.ThrowAsync<TimeoutRejectedException>(async () =>
|
||||
await pipeline.ExecuteAsync(async ct =>
|
||||
{
|
||||
await Task.Delay(TimeSpan.FromSeconds(5), ct);
|
||||
}));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Invalidate_Removes_OnlyMatchingInstance()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var keepId = "drv-keep";
|
||||
var dropId = "drv-drop";
|
||||
|
||||
builder.GetOrCreate(keepId, "h", DriverCapability.Read, TierAOptions);
|
||||
builder.GetOrCreate(keepId, "h", DriverCapability.Write, TierAOptions);
|
||||
builder.GetOrCreate(dropId, "h", DriverCapability.Read, TierAOptions);
|
||||
|
||||
var removed = builder.Invalidate(dropId);
|
||||
|
||||
removed.ShouldBe(1);
|
||||
builder.CachedPipelineCount.ShouldBe(2);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Cancellation_IsNot_Retried()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var pipeline = builder.GetOrCreate("drv-test", "host-1", DriverCapability.Read, TierAOptions);
|
||||
var attempts = 0;
|
||||
using var cts = new CancellationTokenSource();
|
||||
cts.Cancel();
|
||||
|
||||
await Should.ThrowAsync<OperationCanceledException>(async () =>
|
||||
await pipeline.ExecuteAsync(async ct =>
|
||||
{
|
||||
attempts++;
|
||||
ct.ThrowIfCancellationRequested();
|
||||
await Task.Yield();
|
||||
}, cts.Token));
|
||||
|
||||
attempts.ShouldBeLessThanOrEqualTo(1);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,160 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Integration tests for the Phase 6.1 Stream A.5 contract — wrapping a flaky
|
||||
/// <see cref="IReadable"/> / <see cref="IWritable"/> through the <see cref="CapabilityInvoker"/>.
|
||||
/// Exercises the three scenarios the plan enumerates: transient read succeeds after N
|
||||
/// retries; non-idempotent write fails after one attempt; idempotent write retries through.
|
||||
/// </summary>
|
||||
[Trait("Category", "Integration")]
|
||||
public sealed class FlakeyDriverIntegrationTests
|
||||
{
|
||||
[Fact]
|
||||
public async Task Read_SurfacesSuccess_AfterTransientFailures()
|
||||
{
|
||||
var flaky = new FlakeyDriver(failReadsBeforeIndex: 5);
|
||||
var options = new DriverResilienceOptions
|
||||
{
|
||||
Tier = DriverTier.A,
|
||||
CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
// TimeoutSeconds=30 gives slack for 5 exponential-backoff retries under
|
||||
// parallel-test-execution CPU pressure; 10 retries at the default Delay=100ms
|
||||
// exponential can otherwise exceed a 2-second budget intermittently.
|
||||
[DriverCapability.Read] = new(TimeoutSeconds: 30, RetryCount: 10, BreakerFailureThreshold: 50),
|
||||
},
|
||||
};
|
||||
var invoker = new CapabilityInvoker(new DriverResiliencePipelineBuilder(), "drv-test", () => options);
|
||||
|
||||
var result = await invoker.ExecuteAsync(
|
||||
DriverCapability.Read,
|
||||
"host-1",
|
||||
async ct => await flaky.ReadAsync(["tag-a"], ct),
|
||||
CancellationToken.None);
|
||||
|
||||
flaky.ReadAttempts.ShouldBe(6);
|
||||
result[0].StatusCode.ShouldBe(0u);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Write_NonIdempotent_FailsOnFirstFailure_NoReplay()
|
||||
{
|
||||
var flaky = new FlakeyDriver(failWritesBeforeIndex: 3);
|
||||
var optionsWithAggressiveRetry = new DriverResilienceOptions
|
||||
{
|
||||
Tier = DriverTier.A,
|
||||
CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Write] = new(TimeoutSeconds: 2, RetryCount: 5, BreakerFailureThreshold: 50),
|
||||
},
|
||||
};
|
||||
var invoker = new CapabilityInvoker(new DriverResiliencePipelineBuilder(), "drv-test", () => optionsWithAggressiveRetry);
|
||||
|
||||
await Should.ThrowAsync<InvalidOperationException>(async () =>
|
||||
await invoker.ExecuteWriteAsync(
|
||||
"host-1",
|
||||
isIdempotent: false,
|
||||
async ct => await flaky.WriteAsync([new WriteRequest("pulse-coil", true)], ct),
|
||||
CancellationToken.None));
|
||||
|
||||
flaky.WriteAttempts.ShouldBe(1, "non-idempotent write must never replay (decision #44)");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Write_Idempotent_RetriesUntilSuccess()
|
||||
{
|
||||
var flaky = new FlakeyDriver(failWritesBeforeIndex: 2);
|
||||
var optionsWithRetry = new DriverResilienceOptions
|
||||
{
|
||||
Tier = DriverTier.A,
|
||||
CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Write] = new(TimeoutSeconds: 2, RetryCount: 5, BreakerFailureThreshold: 50),
|
||||
},
|
||||
};
|
||||
var invoker = new CapabilityInvoker(new DriverResiliencePipelineBuilder(), "drv-test", () => optionsWithRetry);
|
||||
|
||||
var results = await invoker.ExecuteWriteAsync(
|
||||
"host-1",
|
||||
isIdempotent: true,
|
||||
async ct => await flaky.WriteAsync([new WriteRequest("set-point", 42.0f)], ct),
|
||||
CancellationToken.None);
|
||||
|
||||
flaky.WriteAttempts.ShouldBe(3);
|
||||
results[0].StatusCode.ShouldBe(0u);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task MultipleHosts_OnOneDriver_HaveIndependentFailureCounts()
|
||||
{
|
||||
var flaky = new FlakeyDriver(failReadsBeforeIndex: 0);
|
||||
var options = new DriverResilienceOptions { Tier = DriverTier.A };
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var invoker = new CapabilityInvoker(builder, "drv-test", () => options);
|
||||
|
||||
// host-dead: force many failures to exhaust retries + trip breaker
|
||||
var threshold = options.Resolve(DriverCapability.Read).BreakerFailureThreshold;
|
||||
for (var i = 0; i < threshold + 5; i++)
|
||||
{
|
||||
await Should.ThrowAsync<Exception>(async () =>
|
||||
await invoker.ExecuteAsync(DriverCapability.Read, "host-dead",
|
||||
_ => throw new InvalidOperationException("dead"),
|
||||
CancellationToken.None));
|
||||
}
|
||||
|
||||
// host-live: succeeds on first call — unaffected by the dead-host breaker
|
||||
var liveAttempts = 0;
|
||||
await invoker.ExecuteAsync(DriverCapability.Read, "host-live",
|
||||
_ => { liveAttempts++; return ValueTask.FromResult("ok"); },
|
||||
CancellationToken.None);
|
||||
|
||||
liveAttempts.ShouldBe(1);
|
||||
}
|
||||
|
||||
private sealed class FlakeyDriver : IReadable, IWritable
|
||||
{
|
||||
private readonly int _failReadsBeforeIndex;
|
||||
private readonly int _failWritesBeforeIndex;
|
||||
|
||||
public int ReadAttempts { get; private set; }
|
||||
public int WriteAttempts { get; private set; }
|
||||
|
||||
public FlakeyDriver(int failReadsBeforeIndex = 0, int failWritesBeforeIndex = 0)
|
||||
{
|
||||
_failReadsBeforeIndex = failReadsBeforeIndex;
|
||||
_failWritesBeforeIndex = failWritesBeforeIndex;
|
||||
}
|
||||
|
||||
public Task<IReadOnlyList<DataValueSnapshot>> ReadAsync(
|
||||
IReadOnlyList<string> fullReferences,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
var attempt = ++ReadAttempts;
|
||||
if (attempt <= _failReadsBeforeIndex)
|
||||
throw new InvalidOperationException($"transient read failure #{attempt}");
|
||||
|
||||
var now = DateTime.UtcNow;
|
||||
IReadOnlyList<DataValueSnapshot> result = fullReferences
|
||||
.Select(_ => new DataValueSnapshot(Value: 0, StatusCode: 0u, SourceTimestampUtc: now, ServerTimestampUtc: now))
|
||||
.ToList();
|
||||
return Task.FromResult(result);
|
||||
}
|
||||
|
||||
public Task<IReadOnlyList<WriteResult>> WriteAsync(
|
||||
IReadOnlyList<WriteRequest> writes,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
var attempt = ++WriteAttempts;
|
||||
if (attempt <= _failWritesBeforeIndex)
|
||||
throw new InvalidOperationException($"transient write failure #{attempt}");
|
||||
|
||||
IReadOnlyList<WriteResult> result = writes.Select(_ => new WriteResult(0u)).ToList();
|
||||
return Task.FromResult(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,91 @@
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Stability;
|
||||
|
||||
[Trait("Category", "Unit")]
|
||||
public sealed class MemoryRecycleTests
|
||||
{
|
||||
[Fact]
|
||||
public async Task TierC_HardBreach_RequestsSupervisorRecycle()
|
||||
{
|
||||
var supervisor = new FakeSupervisor();
|
||||
var recycle = new MemoryRecycle(DriverTier.C, supervisor, NullLogger<MemoryRecycle>.Instance);
|
||||
|
||||
var requested = await recycle.HandleAsync(MemoryTrackingAction.HardBreach, 2_000_000_000, CancellationToken.None);
|
||||
|
||||
requested.ShouldBeTrue();
|
||||
supervisor.RecycleCount.ShouldBe(1);
|
||||
supervisor.LastReason.ShouldContain("hard-breach");
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
public async Task InProcessTier_HardBreach_NeverRequestsRecycle(DriverTier tier)
|
||||
{
|
||||
var supervisor = new FakeSupervisor();
|
||||
var recycle = new MemoryRecycle(tier, supervisor, NullLogger<MemoryRecycle>.Instance);
|
||||
|
||||
var requested = await recycle.HandleAsync(MemoryTrackingAction.HardBreach, 2_000_000_000, CancellationToken.None);
|
||||
|
||||
requested.ShouldBeFalse("Tier A/B hard-breach logs a promotion recommendation only (decisions #74, #145)");
|
||||
supervisor.RecycleCount.ShouldBe(0);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task TierC_WithoutSupervisor_HardBreach_NoOp()
|
||||
{
|
||||
var recycle = new MemoryRecycle(DriverTier.C, supervisor: null, NullLogger<MemoryRecycle>.Instance);
|
||||
|
||||
var requested = await recycle.HandleAsync(MemoryTrackingAction.HardBreach, 2_000_000_000, CancellationToken.None);
|
||||
|
||||
requested.ShouldBeFalse("no supervisor → no recycle path; action logged only");
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
[InlineData(DriverTier.C)]
|
||||
public async Task SoftBreach_NeverRequestsRecycle(DriverTier tier)
|
||||
{
|
||||
var supervisor = new FakeSupervisor();
|
||||
var recycle = new MemoryRecycle(tier, supervisor, NullLogger<MemoryRecycle>.Instance);
|
||||
|
||||
var requested = await recycle.HandleAsync(MemoryTrackingAction.SoftBreach, 1_000_000_000, CancellationToken.None);
|
||||
|
||||
requested.ShouldBeFalse("soft-breach is surface-only at every tier");
|
||||
supervisor.RecycleCount.ShouldBe(0);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(MemoryTrackingAction.None)]
|
||||
[InlineData(MemoryTrackingAction.Warming)]
|
||||
public async Task NonBreachActions_NoOp(MemoryTrackingAction action)
|
||||
{
|
||||
var supervisor = new FakeSupervisor();
|
||||
var recycle = new MemoryRecycle(DriverTier.C, supervisor, NullLogger<MemoryRecycle>.Instance);
|
||||
|
||||
var requested = await recycle.HandleAsync(action, 100_000_000, CancellationToken.None);
|
||||
|
||||
requested.ShouldBeFalse();
|
||||
supervisor.RecycleCount.ShouldBe(0);
|
||||
}
|
||||
|
||||
private sealed class FakeSupervisor : IDriverSupervisor
|
||||
{
|
||||
public string DriverInstanceId => "fake-tier-c";
|
||||
public int RecycleCount { get; private set; }
|
||||
public string? LastReason { get; private set; }
|
||||
|
||||
public Task RecycleAsync(string reason, CancellationToken cancellationToken)
|
||||
{
|
||||
RecycleCount++;
|
||||
LastReason = reason;
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,119 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Stability;
|
||||
|
||||
[Trait("Category", "Unit")]
|
||||
public sealed class MemoryTrackingTests
|
||||
{
|
||||
private static readonly DateTime T0 = new(2026, 4, 19, 12, 0, 0, DateTimeKind.Utc);
|
||||
|
||||
[Fact]
|
||||
public void WarmingUp_Returns_Warming_UntilWindowElapses()
|
||||
{
|
||||
var tracker = new MemoryTracking(DriverTier.A, TimeSpan.FromMinutes(5));
|
||||
|
||||
tracker.Sample(100_000_000, T0).ShouldBe(MemoryTrackingAction.Warming);
|
||||
tracker.Sample(105_000_000, T0.AddMinutes(1)).ShouldBe(MemoryTrackingAction.Warming);
|
||||
tracker.Sample(102_000_000, T0.AddMinutes(4.9)).ShouldBe(MemoryTrackingAction.Warming);
|
||||
|
||||
tracker.Phase.ShouldBe(TrackingPhase.WarmingUp);
|
||||
tracker.BaselineBytes.ShouldBe(0);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void WindowElapsed_CapturesBaselineAsMedian_AndTransitionsToSteady()
|
||||
{
|
||||
var tracker = new MemoryTracking(DriverTier.A, TimeSpan.FromMinutes(5));
|
||||
|
||||
tracker.Sample(100_000_000, T0);
|
||||
tracker.Sample(200_000_000, T0.AddMinutes(1));
|
||||
tracker.Sample(150_000_000, T0.AddMinutes(2));
|
||||
var first = tracker.Sample(150_000_000, T0.AddMinutes(5));
|
||||
|
||||
tracker.Phase.ShouldBe(TrackingPhase.Steady);
|
||||
tracker.BaselineBytes.ShouldBe(150_000_000L, "median of 4 samples [100, 200, 150, 150] = (150+150)/2 = 150");
|
||||
first.ShouldBe(MemoryTrackingAction.None, "150 MB is the baseline itself, well under soft threshold");
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A, 3, 50)]
|
||||
[InlineData(DriverTier.B, 3, 100)]
|
||||
[InlineData(DriverTier.C, 2, 500)]
|
||||
public void GetTierConstants_MatchesDecision146(DriverTier tier, int expectedMultiplier, long expectedFloorMB)
|
||||
{
|
||||
var (multiplier, floor) = MemoryTracking.GetTierConstants(tier);
|
||||
multiplier.ShouldBe(expectedMultiplier);
|
||||
floor.ShouldBe(expectedFloorMB * 1024 * 1024);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void SoftThreshold_UsesMax_OfMultiplierAndFloor_SmallBaseline()
|
||||
{
|
||||
// Tier A: mult=3, floor=50 MB. Baseline 10 MB → 3×10=30 MB < 10+50=60 MB → floor wins.
|
||||
var tracker = WarmupWithBaseline(DriverTier.A, 10L * 1024 * 1024);
|
||||
tracker.SoftThresholdBytes.ShouldBe(60L * 1024 * 1024);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void SoftThreshold_UsesMax_OfMultiplierAndFloor_LargeBaseline()
|
||||
{
|
||||
// Tier A: mult=3, floor=50 MB. Baseline 200 MB → 3×200=600 MB > 200+50=250 MB → multiplier wins.
|
||||
var tracker = WarmupWithBaseline(DriverTier.A, 200L * 1024 * 1024);
|
||||
tracker.SoftThresholdBytes.ShouldBe(600L * 1024 * 1024);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void HardThreshold_IsTwiceSoft()
|
||||
{
|
||||
var tracker = WarmupWithBaseline(DriverTier.B, 200L * 1024 * 1024);
|
||||
tracker.HardThresholdBytes.ShouldBe(tracker.SoftThresholdBytes * 2);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Sample_Below_Soft_Returns_None()
|
||||
{
|
||||
var tracker = WarmupWithBaseline(DriverTier.A, 100L * 1024 * 1024);
|
||||
|
||||
tracker.Sample(200L * 1024 * 1024, T0.AddMinutes(10)).ShouldBe(MemoryTrackingAction.None);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Sample_AtSoft_Returns_SoftBreach()
|
||||
{
|
||||
// Tier A, baseline 200 MB → soft = 600 MB. Sample exactly at soft.
|
||||
var tracker = WarmupWithBaseline(DriverTier.A, 200L * 1024 * 1024);
|
||||
|
||||
tracker.Sample(tracker.SoftThresholdBytes, T0.AddMinutes(10))
|
||||
.ShouldBe(MemoryTrackingAction.SoftBreach);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Sample_AtHard_Returns_HardBreach()
|
||||
{
|
||||
var tracker = WarmupWithBaseline(DriverTier.A, 200L * 1024 * 1024);
|
||||
|
||||
tracker.Sample(tracker.HardThresholdBytes, T0.AddMinutes(10))
|
||||
.ShouldBe(MemoryTrackingAction.HardBreach);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Sample_AboveHard_Returns_HardBreach()
|
||||
{
|
||||
var tracker = WarmupWithBaseline(DriverTier.A, 200L * 1024 * 1024);
|
||||
|
||||
tracker.Sample(tracker.HardThresholdBytes + 100_000_000, T0.AddMinutes(10))
|
||||
.ShouldBe(MemoryTrackingAction.HardBreach);
|
||||
}
|
||||
|
||||
private static MemoryTracking WarmupWithBaseline(DriverTier tier, long baseline)
|
||||
{
|
||||
var tracker = new MemoryTracking(tier, TimeSpan.FromMinutes(5));
|
||||
tracker.Sample(baseline, T0);
|
||||
tracker.Sample(baseline, T0.AddMinutes(5));
|
||||
tracker.BaselineBytes.ShouldBe(baseline);
|
||||
return tracker;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,101 @@
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Stability;
|
||||
|
||||
[Trait("Category", "Unit")]
|
||||
public sealed class ScheduledRecycleSchedulerTests
|
||||
{
|
||||
private static readonly DateTime T0 = new(2026, 4, 19, 0, 0, 0, DateTimeKind.Utc);
|
||||
private static readonly TimeSpan Weekly = TimeSpan.FromDays(7);
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
public void TierAOrB_Ctor_Throws(DriverTier tier)
|
||||
{
|
||||
var supervisor = new FakeSupervisor();
|
||||
Should.Throw<ArgumentException>(() => new ScheduledRecycleScheduler(
|
||||
tier, Weekly, T0, supervisor, NullLogger<ScheduledRecycleScheduler>.Instance));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ZeroOrNegativeInterval_Throws()
|
||||
{
|
||||
var supervisor = new FakeSupervisor();
|
||||
Should.Throw<ArgumentException>(() => new ScheduledRecycleScheduler(
|
||||
DriverTier.C, TimeSpan.Zero, T0, supervisor, NullLogger<ScheduledRecycleScheduler>.Instance));
|
||||
Should.Throw<ArgumentException>(() => new ScheduledRecycleScheduler(
|
||||
DriverTier.C, TimeSpan.FromSeconds(-1), T0, supervisor, NullLogger<ScheduledRecycleScheduler>.Instance));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Tick_BeforeNextRecycle_NoOp()
|
||||
{
|
||||
var supervisor = new FakeSupervisor();
|
||||
var sch = new ScheduledRecycleScheduler(DriverTier.C, Weekly, T0, supervisor, NullLogger<ScheduledRecycleScheduler>.Instance);
|
||||
|
||||
var fired = await sch.TickAsync(T0 + TimeSpan.FromDays(6), CancellationToken.None);
|
||||
|
||||
fired.ShouldBeFalse();
|
||||
supervisor.RecycleCount.ShouldBe(0);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Tick_AtOrAfterNextRecycle_FiresOnce_AndAdvances()
|
||||
{
|
||||
var supervisor = new FakeSupervisor();
|
||||
var sch = new ScheduledRecycleScheduler(DriverTier.C, Weekly, T0, supervisor, NullLogger<ScheduledRecycleScheduler>.Instance);
|
||||
|
||||
var fired = await sch.TickAsync(T0 + Weekly + TimeSpan.FromMinutes(1), CancellationToken.None);
|
||||
|
||||
fired.ShouldBeTrue();
|
||||
supervisor.RecycleCount.ShouldBe(1);
|
||||
sch.NextRecycleUtc.ShouldBe(T0 + Weekly + Weekly);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task RequestRecycleNow_Fires_Immediately_WithoutAdvancingSchedule()
|
||||
{
|
||||
var supervisor = new FakeSupervisor();
|
||||
var sch = new ScheduledRecycleScheduler(DriverTier.C, Weekly, T0, supervisor, NullLogger<ScheduledRecycleScheduler>.Instance);
|
||||
var nextBefore = sch.NextRecycleUtc;
|
||||
|
||||
await sch.RequestRecycleNowAsync("memory hard-breach", CancellationToken.None);
|
||||
|
||||
supervisor.RecycleCount.ShouldBe(1);
|
||||
supervisor.LastReason.ShouldBe("memory hard-breach");
|
||||
sch.NextRecycleUtc.ShouldBe(nextBefore, "ad-hoc recycle doesn't shift the cron schedule");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task MultipleFires_AcrossTicks_AdvanceOneIntervalEach()
|
||||
{
|
||||
var supervisor = new FakeSupervisor();
|
||||
var sch = new ScheduledRecycleScheduler(DriverTier.C, TimeSpan.FromDays(1), T0, supervisor, NullLogger<ScheduledRecycleScheduler>.Instance);
|
||||
|
||||
await sch.TickAsync(T0 + TimeSpan.FromDays(1) + TimeSpan.FromHours(1), CancellationToken.None);
|
||||
await sch.TickAsync(T0 + TimeSpan.FromDays(2) + TimeSpan.FromHours(1), CancellationToken.None);
|
||||
await sch.TickAsync(T0 + TimeSpan.FromDays(3) + TimeSpan.FromHours(1), CancellationToken.None);
|
||||
|
||||
supervisor.RecycleCount.ShouldBe(3);
|
||||
sch.NextRecycleUtc.ShouldBe(T0 + TimeSpan.FromDays(4));
|
||||
}
|
||||
|
||||
private sealed class FakeSupervisor : IDriverSupervisor
|
||||
{
|
||||
public string DriverInstanceId => "tier-c-fake";
|
||||
public int RecycleCount { get; private set; }
|
||||
public string? LastReason { get; private set; }
|
||||
|
||||
public Task RecycleAsync(string reason, CancellationToken cancellationToken)
|
||||
{
|
||||
RecycleCount++;
|
||||
LastReason = reason;
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,112 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Stability;
|
||||
|
||||
[Trait("Category", "Unit")]
|
||||
public sealed class WedgeDetectorTests
|
||||
{
|
||||
private static readonly DateTime Now = new(2026, 4, 19, 12, 0, 0, DateTimeKind.Utc);
|
||||
private static readonly TimeSpan Threshold = TimeSpan.FromSeconds(120);
|
||||
|
||||
[Fact]
|
||||
public void SubSixtySecondThreshold_ClampsToSixty()
|
||||
{
|
||||
var detector = new WedgeDetector(TimeSpan.FromSeconds(10));
|
||||
detector.Threshold.ShouldBe(TimeSpan.FromSeconds(60));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Unhealthy_Driver_AlwaysNotApplicable()
|
||||
{
|
||||
var detector = new WedgeDetector(Threshold);
|
||||
var demand = new DemandSignal(BulkheadDepth: 5, ActiveMonitoredItems: 10, QueuedHistoryReads: 0, LastProgressUtc: Now.AddMinutes(-10));
|
||||
|
||||
detector.Classify(DriverState.Faulted, demand, Now).ShouldBe(WedgeVerdict.NotApplicable);
|
||||
detector.Classify(DriverState.Degraded, demand, Now).ShouldBe(WedgeVerdict.NotApplicable);
|
||||
detector.Classify(DriverState.Initializing, demand, Now).ShouldBe(WedgeVerdict.NotApplicable);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Idle_Subscription_Only_StaysIdle()
|
||||
{
|
||||
// Idle driver: bulkhead 0, monitored items 0, no history reads queued.
|
||||
// Even if LastProgressUtc is ancient, the verdict is Idle, not Faulted.
|
||||
var detector = new WedgeDetector(Threshold);
|
||||
var demand = new DemandSignal(0, 0, 0, Now.AddHours(-12));
|
||||
|
||||
detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Idle);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void PendingWork_WithRecentProgress_StaysHealthy()
|
||||
{
|
||||
var detector = new WedgeDetector(Threshold);
|
||||
var demand = new DemandSignal(BulkheadDepth: 2, ActiveMonitoredItems: 0, QueuedHistoryReads: 0, LastProgressUtc: Now.AddSeconds(-30));
|
||||
|
||||
detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Healthy);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void PendingWork_WithStaleProgress_IsFaulted()
|
||||
{
|
||||
var detector = new WedgeDetector(Threshold);
|
||||
var demand = new DemandSignal(BulkheadDepth: 2, ActiveMonitoredItems: 0, QueuedHistoryReads: 0, LastProgressUtc: Now.AddMinutes(-5));
|
||||
|
||||
detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Faulted);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void MonitoredItems_Active_ButNoRecentPublish_IsFaulted()
|
||||
{
|
||||
// Subscription-only driver with live MonitoredItems but no publish progress within threshold
|
||||
// is a real wedge — this is the case the previous "no successful Read" formulation used
|
||||
// to miss (no reads ever happen).
|
||||
var detector = new WedgeDetector(Threshold);
|
||||
var demand = new DemandSignal(BulkheadDepth: 0, ActiveMonitoredItems: 5, QueuedHistoryReads: 0, LastProgressUtc: Now.AddMinutes(-10));
|
||||
|
||||
detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Faulted);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void MonitoredItems_Active_WithFreshPublish_StaysHealthy()
|
||||
{
|
||||
var detector = new WedgeDetector(Threshold);
|
||||
var demand = new DemandSignal(BulkheadDepth: 0, ActiveMonitoredItems: 5, QueuedHistoryReads: 0, LastProgressUtc: Now.AddSeconds(-10));
|
||||
|
||||
detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Healthy);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void HistoryBackfill_SlowButMakingProgress_StaysHealthy()
|
||||
{
|
||||
// Slow historian backfill — QueuedHistoryReads > 0 but progress advances within threshold.
|
||||
var detector = new WedgeDetector(Threshold);
|
||||
var demand = new DemandSignal(BulkheadDepth: 0, ActiveMonitoredItems: 0, QueuedHistoryReads: 50, LastProgressUtc: Now.AddSeconds(-60));
|
||||
|
||||
detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Healthy);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void WriteOnlyBurst_StaysIdle_WhenBulkheadEmpty()
|
||||
{
|
||||
// A write-only driver that just finished a burst: bulkhead drained, no subscriptions, no
|
||||
// history reads. Idle — the previous formulation would have faulted here because no
|
||||
// reads were succeeding even though the driver is perfectly healthy.
|
||||
var detector = new WedgeDetector(Threshold);
|
||||
var demand = new DemandSignal(0, 0, 0, Now.AddMinutes(-30));
|
||||
|
||||
detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Idle);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void DemandSignal_HasPendingWork_TrueForAnyNonZeroCounter()
|
||||
{
|
||||
new DemandSignal(1, 0, 0, Now).HasPendingWork.ShouldBeTrue();
|
||||
new DemandSignal(0, 1, 0, Now).HasPendingWork.ShouldBeTrue();
|
||||
new DemandSignal(0, 0, 1, Now).HasPendingWork.ShouldBeTrue();
|
||||
new DemandSignal(0, 0, 0, Now).HasPendingWork.ShouldBeFalse();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,127 @@
|
||||
using System.Linq;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using Xunit.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.Tests
|
||||
{
|
||||
/// <summary>
|
||||
/// Exercises <see cref="AvevaPrerequisites"/> against the live dev box so the helper
|
||||
/// itself gets integration coverage — i.e. "do the probes return Pass for things that
|
||||
/// really are Pass?" as validated against this machine's known-installed topology.
|
||||
/// Category <c>LiveGalaxy</c> so CI / clean dev boxes skip cleanly.
|
||||
/// </summary>
|
||||
[Trait("Category", "LiveGalaxy")]
|
||||
public sealed class AvevaPrerequisitesLiveTests
|
||||
{
|
||||
private readonly ITestOutputHelper _output;
|
||||
|
||||
public AvevaPrerequisitesLiveTests(ITestOutputHelper output) => _output = output;
|
||||
|
||||
[Fact]
|
||||
public async Task CheckAll_on_live_box_reports_Framework_install()
|
||||
{
|
||||
var report = await AvevaPrerequisites.CheckAllAsync();
|
||||
_output.WriteLine(report.ToString());
|
||||
report.Checks.ShouldContain(c =>
|
||||
c.Name == "registry:ArchestrA.Framework" && c.Status == PrerequisiteStatus.Pass,
|
||||
"ArchestrA Framework registry root should be found on this machine.");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task CheckAll_on_live_box_reports_aaBootstrap_running()
|
||||
{
|
||||
var report = await AvevaPrerequisites.CheckAllAsync();
|
||||
var bootstrap = report.Checks.FirstOrDefault(c => c.Name == "service:aaBootstrap");
|
||||
bootstrap.ShouldNotBeNull();
|
||||
bootstrap.Status.ShouldBe(PrerequisiteStatus.Pass,
|
||||
$"aaBootstrap must be Running for any live-Galaxy test to work — detail: {bootstrap.Detail}");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task CheckAll_on_live_box_reports_aaGR_running()
|
||||
{
|
||||
var report = await AvevaPrerequisites.CheckAllAsync();
|
||||
var gr = report.Checks.FirstOrDefault(c => c.Name == "service:aaGR");
|
||||
gr.ShouldNotBeNull();
|
||||
gr.Status.ShouldBe(PrerequisiteStatus.Pass,
|
||||
$"aaGR (Galaxy Repository) must be Running — detail: {gr.Detail}");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task CheckAll_on_live_box_reports_MxAccess_COM_registered()
|
||||
{
|
||||
var report = await AvevaPrerequisites.CheckAllAsync();
|
||||
var com = report.Checks.FirstOrDefault(c => c.Name == "com:LMXProxy");
|
||||
com.ShouldNotBeNull();
|
||||
com.Status.ShouldBe(PrerequisiteStatus.Pass,
|
||||
$"LMXProxy.LMXProxyServer ProgID must resolve to an InprocServer32 DLL — detail: {com.Detail}");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task CheckRepositoryOnly_on_live_box_reports_ZB_reachable()
|
||||
{
|
||||
var report = await AvevaPrerequisites.CheckRepositoryOnlyAsync(ct: CancellationToken.None);
|
||||
var zb = report.Checks.FirstOrDefault(c => c.Name == "sql:ZB");
|
||||
zb.ShouldNotBeNull();
|
||||
zb.Status.ShouldBe(PrerequisiteStatus.Pass,
|
||||
$"ZB database must be reachable via SQL Server Windows auth — detail: {zb.Detail}");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task CheckRepositoryOnly_on_live_box_reports_non_zero_deployed_objects()
|
||||
{
|
||||
// This box has 49 deployed objects per the research; we just assert > 0 so adding/
|
||||
// removing objects doesn't break the test.
|
||||
var report = await AvevaPrerequisites.CheckRepositoryOnlyAsync();
|
||||
var deployed = report.Checks.FirstOrDefault(c => c.Name == "sql:ZB.deployedObjects");
|
||||
deployed.ShouldNotBeNull();
|
||||
deployed.Status.ShouldBe(PrerequisiteStatus.Pass,
|
||||
$"At least one deployed gobject should exist — detail: {deployed.Detail}");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Aveva_side_is_ready_on_this_machine()
|
||||
{
|
||||
// Narrower than "livetest ready" — our own services (OtOpcUa / OtOpcUaGalaxyHost)
|
||||
// may not be installed on a developer's box while they're actively iterating on
|
||||
// them, but the AVEVA side (Framework / Galaxy Repository / MXAccess COM /
|
||||
// SQL / core services) should always be up on a machine with System Platform
|
||||
// installed. This assertion is what gates live-Galaxy tests that go straight to
|
||||
// the Galaxy Repository without routing through our stack.
|
||||
var report = await AvevaPrerequisites.CheckAllAsync(
|
||||
new AvevaPrerequisites.Options { CheckGalaxyHostPipe = false });
|
||||
_output.WriteLine(report.ToString());
|
||||
_output.WriteLine(report.Warnings ?? "no warnings");
|
||||
|
||||
// Enumerate AVEVA-side failures (if any) for an actionable assertion message.
|
||||
var avevaFails = report.Checks
|
||||
.Where(c => c.Status == PrerequisiteStatus.Fail &&
|
||||
c.Category != PrerequisiteCategory.OtOpcUaService)
|
||||
.ToList();
|
||||
report.IsAvevaSideReady.ShouldBeTrue(
|
||||
avevaFails.Count == 0
|
||||
? "unexpected state"
|
||||
: "AVEVA-side failures: " + string.Join(" ; ",
|
||||
avevaFails.Select(f => $"{f.Name}: {f.Detail}")));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Report_captures_OtOpcUa_services_state_even_when_not_installed()
|
||||
{
|
||||
// The helper reports the status of OtOpcUaGalaxyHost + OtOpcUa services even if
|
||||
// they're not installed yet — absence is itself an actionable signal. This test
|
||||
// doesn't assert Pass/Fail on those services (their state depends on what's
|
||||
// installed when the test runs) — it only asserts the helper EMITTED the rows,
|
||||
// so nobody can ship a prerequisite check that silently omits our own services.
|
||||
var report = await AvevaPrerequisites.CheckAllAsync();
|
||||
|
||||
report.Checks.ShouldContain(c => c.Name == "service:OtOpcUaGalaxyHost");
|
||||
report.Checks.ShouldContain(c => c.Name == "service:OtOpcUa");
|
||||
report.Checks.ShouldContain(c => c.Name == "service:GLAuth");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -6,6 +6,7 @@ using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.Backend;
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.Backend.Galaxy;
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Shared.Contracts;
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.Tests
|
||||
{
|
||||
@@ -16,6 +17,11 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.Tests
|
||||
/// SQL the v1 Host uses, proving the lift is byte-for-byte equivalent at the
|
||||
/// <c>DiscoverHierarchyResponse</c> shape.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Since PR 36, skip logic is delegated to <see cref="AvevaPrerequisites.CheckRepositoryOnlyAsync"/>
|
||||
/// so operators see exactly why a test skipped ("ZB db not found" vs "SQL Server
|
||||
/// unreachable") instead of a silent return.
|
||||
/// </remarks>
|
||||
[Trait("Category", "LiveGalaxy")]
|
||||
public sealed class GalaxyRepositoryLiveSmokeTests
|
||||
{
|
||||
@@ -26,15 +32,20 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.Tests
|
||||
CommandTimeoutSeconds = 10,
|
||||
};
|
||||
|
||||
private static async Task<string?> RepositorySkipReasonAsync()
|
||||
{
|
||||
using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(4));
|
||||
var report = await AvevaPrerequisites.CheckRepositoryOnlyAsync(
|
||||
DevZbOptions().ConnectionString, cts.Token);
|
||||
return report.SkipReason;
|
||||
}
|
||||
|
||||
private static async Task<bool> ZbReachableAsync()
|
||||
{
|
||||
try
|
||||
{
|
||||
var repo = new GalaxyRepository(DevZbOptions());
|
||||
using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(3));
|
||||
return await repo.TestConnectionAsync(cts.Token);
|
||||
}
|
||||
catch { return false; }
|
||||
// Legacy silent-skip adapter — keeps the existing tests compiling while
|
||||
// gradually migrating to the Skip-with-reason pattern. Returns true when the
|
||||
// prerequisite check has no Fail entries.
|
||||
return (await RepositorySkipReasonAsync()) is null;
|
||||
}
|
||||
|
||||
[Fact]
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\src\ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host\ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.csproj"/>
|
||||
<ProjectReference Include="..\ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport\ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport.csproj"/>
|
||||
<Reference Include="System.ServiceProcess"/>
|
||||
<!-- IMxProxy's delegate signatures mention ArchestrA.MxAccess.MXSTATUS_PROXY, so tests
|
||||
implementing the interface must resolve that type at compile time. -->
|
||||
|
||||
@@ -0,0 +1,81 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Proxy;
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Shared.Contracts;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Proxy.Tests;
|
||||
|
||||
/// <summary>
|
||||
/// Pins <see cref="GalaxyProxyDriver.ToHistoricalEvent"/> — the wire-to-domain mapping
|
||||
/// from <see cref="GalaxyHistoricalEvent"/> (MessagePack-annotated IPC contract,
|
||||
/// Unix-ms timestamps) to <c>Core.Abstractions.HistoricalEvent</c> (domain record,
|
||||
/// <see cref="DateTime"/> timestamps). Added in PR 35 alongside the new
|
||||
/// <c>IHistoryProvider.ReadEventsAsync</c> method.
|
||||
/// </summary>
|
||||
[Trait("Category", "Unit")]
|
||||
public sealed class HistoricalEventMappingTests
|
||||
{
|
||||
[Fact]
|
||||
public void Maps_every_field_from_wire_to_domain_record()
|
||||
{
|
||||
var wire = new GalaxyHistoricalEvent
|
||||
{
|
||||
EventId = "evt-42",
|
||||
SourceName = "Tank1.HiAlarm",
|
||||
EventTimeUtcUnixMs = 1_700_000_000_000L, // 2023-11-14T22:13:20.000Z
|
||||
ReceivedTimeUtcUnixMs = 1_700_000_000_500L,
|
||||
DisplayText = "High level reached",
|
||||
Severity = 750,
|
||||
};
|
||||
|
||||
var domain = GalaxyProxyDriver.ToHistoricalEvent(wire);
|
||||
|
||||
domain.EventId.ShouldBe("evt-42");
|
||||
domain.SourceName.ShouldBe("Tank1.HiAlarm");
|
||||
domain.EventTimeUtc.ShouldBe(new DateTime(2023, 11, 14, 22, 13, 20, DateTimeKind.Utc));
|
||||
domain.ReceivedTimeUtc.ShouldBe(new DateTime(2023, 11, 14, 22, 13, 20, 500, DateTimeKind.Utc));
|
||||
domain.Message.ShouldBe("High level reached");
|
||||
domain.Severity.ShouldBe((ushort)750);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Preserves_null_SourceName_and_DisplayText()
|
||||
{
|
||||
// Historical rows from the Galaxy event historian often omit source or message for
|
||||
// system events (e.g. time sync). The mapping must preserve null — callers use it to
|
||||
// distinguish system events from alarm events.
|
||||
var wire = new GalaxyHistoricalEvent
|
||||
{
|
||||
EventId = "sys-1",
|
||||
SourceName = null,
|
||||
EventTimeUtcUnixMs = 0,
|
||||
ReceivedTimeUtcUnixMs = 0,
|
||||
DisplayText = null,
|
||||
Severity = 1,
|
||||
};
|
||||
|
||||
var domain = GalaxyProxyDriver.ToHistoricalEvent(wire);
|
||||
|
||||
domain.SourceName.ShouldBeNull();
|
||||
domain.Message.ShouldBeNull();
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void EventTime_and_ReceivedTime_are_produced_as_DateTimeKind_Utc()
|
||||
{
|
||||
// Unix-ms timestamps come off the wire timezone-agnostic; the mapping must tag the
|
||||
// resulting DateTime as Utc so downstream serializers (JSON, OPC UA types) don't apply
|
||||
// an unexpected local-time offset.
|
||||
var wire = new GalaxyHistoricalEvent
|
||||
{
|
||||
EventId = "e",
|
||||
EventTimeUtcUnixMs = 1_000L,
|
||||
ReceivedTimeUtcUnixMs = 2_000L,
|
||||
};
|
||||
|
||||
var domain = GalaxyProxyDriver.ToHistoricalEvent(wire);
|
||||
|
||||
domain.EventTimeUtc.Kind.ShouldBe(DateTimeKind.Utc);
|
||||
domain.ReceivedTimeUtc.Kind.ShouldBe(DateTimeKind.Utc);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,75 @@
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Runtime.Versioning;
|
||||
using Microsoft.Win32;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Proxy.Tests.LiveStack;
|
||||
|
||||
/// <summary>
|
||||
/// Resolves the pipe name + shared secret the live <see cref="GalaxyProxyDriver"/> needs
|
||||
/// to connect to a running <c>OtOpcUaGalaxyHost</c> Windows service. Two sources are
|
||||
/// consulted, first match wins:
|
||||
/// <list type="number">
|
||||
/// <item>Explicit env vars (<c>OTOPCUA_GALAXY_PIPE</c>, <c>OTOPCUA_GALAXY_SECRET</c>) — lets CI / benchwork override.</item>
|
||||
/// <item>The service's per-process <c>Environment</c> registry values under
|
||||
/// <c>HKLM\SYSTEM\CurrentControlSet\Services\OtOpcUaGalaxyHost</c> — what
|
||||
/// <c>Install-Services.ps1</c> writes at install time. Requires the test to run as a
|
||||
/// principal with read access to that registry key (typically Administrators).</item>
|
||||
/// </list>
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Explicitly NOT baked-in-to-source: the shared secret is rotated per install (the
|
||||
/// installer generates 32 random bytes and stores the base64 string). A hard-coded secret
|
||||
/// in tests would diverge from production the moment someone re-installed the service.
|
||||
/// </remarks>
|
||||
public sealed record LiveStackConfig(string PipeName, string SharedSecret, string? Source)
|
||||
{
|
||||
public const string EnvPipeName = "OTOPCUA_GALAXY_PIPE";
|
||||
public const string EnvSharedSecret = "OTOPCUA_GALAXY_SECRET";
|
||||
public const string ServiceRegistryKey =
|
||||
@"SYSTEM\CurrentControlSet\Services\OtOpcUaGalaxyHost";
|
||||
public const string DefaultPipeName = "OtOpcUaGalaxy";
|
||||
|
||||
public static LiveStackConfig? Resolve()
|
||||
{
|
||||
var envPipe = Environment.GetEnvironmentVariable(EnvPipeName);
|
||||
var envSecret = Environment.GetEnvironmentVariable(EnvSharedSecret);
|
||||
if (!string.IsNullOrWhiteSpace(envPipe) && !string.IsNullOrWhiteSpace(envSecret))
|
||||
return new LiveStackConfig(envPipe, envSecret, "env vars");
|
||||
|
||||
if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
|
||||
return null;
|
||||
|
||||
return FromServiceRegistry();
|
||||
}
|
||||
|
||||
[SupportedOSPlatform("windows")]
|
||||
private static LiveStackConfig? FromServiceRegistry()
|
||||
{
|
||||
try
|
||||
{
|
||||
using var key = Registry.LocalMachine.OpenSubKey(ServiceRegistryKey);
|
||||
if (key is null) return null;
|
||||
var env = key.GetValue("Environment") as string[];
|
||||
if (env is null || env.Length == 0) return null;
|
||||
|
||||
string? pipe = null, secret = null;
|
||||
foreach (var line in env)
|
||||
{
|
||||
var eq = line.IndexOf('=');
|
||||
if (eq <= 0) continue;
|
||||
var name = line[..eq];
|
||||
var value = line[(eq + 1)..];
|
||||
if (name.Equals(EnvPipeName, StringComparison.OrdinalIgnoreCase)) pipe = value;
|
||||
else if (name.Equals(EnvSharedSecret, StringComparison.OrdinalIgnoreCase)) secret = value;
|
||||
}
|
||||
|
||||
if (string.IsNullOrWhiteSpace(secret)) return null;
|
||||
return new LiveStackConfig(pipe ?? DefaultPipeName, secret, "service registry");
|
||||
}
|
||||
catch
|
||||
{
|
||||
// Access denied / key missing / malformed — caller gets null and surfaces a Skip.
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,164 @@
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Runtime.Versioning;
|
||||
using System.Security.Principal;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Proxy.Tests.LiveStack;
|
||||
|
||||
/// <summary>
|
||||
/// Connects a single <see cref="GalaxyProxyDriver"/> to the already-running
|
||||
/// <c>OtOpcUaGalaxyHost</c> Windows service for the lifetime of a test class. Uses
|
||||
/// <see cref="AvevaPrerequisites"/> to decide whether to proceed; on failure,
|
||||
/// <see cref="SkipReason"/> is populated and each test calls <see cref="SkipIfUnavailable"/>
|
||||
/// to translate that into <c>Assert.Skip</c>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Does NOT spawn the Host process.</b> Production deploys <c>OtOpcUaGalaxyHost</c>
|
||||
/// as a standalone Windows service — spawning a second instance from a test would
|
||||
/// bypass the COM-apartment + service-account setup and fail differently than
|
||||
/// production (see <c>project_galaxy_host_service.md</c> memory).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Shared-secret handling</b>: read from <see cref="LiveStackConfig"/> — env vars
|
||||
/// first, then the service's registry-stored <c>Environment</c> values. Requires
|
||||
/// the test process to have read access to
|
||||
/// <c>HKLM\SYSTEM\CurrentControlSet\Services\OtOpcUaGalaxyHost</c>; on a dev box
|
||||
/// that typically means running the test host elevated, or exporting
|
||||
/// <c>OTOPCUA_GALAXY_SECRET</c> out-of-band.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class LiveStackFixture : IAsyncLifetime
|
||||
{
|
||||
public GalaxyProxyDriver? Driver { get; private set; }
|
||||
|
||||
public string? SkipReason { get; private set; }
|
||||
|
||||
public PrerequisiteReport? PrerequisiteReport { get; private set; }
|
||||
|
||||
public LiveStackConfig? Config { get; private set; }
|
||||
|
||||
public async ValueTask InitializeAsync()
|
||||
{
|
||||
// 0. Elevated-shell short-circuit. The OtOpcUaGalaxyHost pipe ACL allows the configured
|
||||
// SID but explicitly DENIES Administrators (decision #76 — production hardening).
|
||||
// A test process running with a high-integrity token (any elevated shell) carries the
|
||||
// Admins group in its security context, so the deny rule trumps the user's allow and
|
||||
// the pipe connect returns UnauthorizedAccessException — technically correct but
|
||||
// the operationally confusing failure mode that ate most of the PR 37 install
|
||||
// debugging session. Surfacing it explicitly here saves the next operator the same
|
||||
// five-step diagnosis. ParityFixture has the same skip with the same rationale.
|
||||
if (IsElevatedAdministratorOnWindows())
|
||||
{
|
||||
SkipReason =
|
||||
"Test host is running with elevated (Administrators) privileges, but the " +
|
||||
"OtOpcUaGalaxyHost named-pipe ACL explicitly denies Administrators per the IPC " +
|
||||
"security design (decision #76 / PipeAcl.cs). Re-run from a NORMAL (non-admin) " +
|
||||
"PowerShell window — even when your user is already in the pipe's allow list, " +
|
||||
"the elevated token's Admins group membership trumps the allow rule.";
|
||||
return;
|
||||
}
|
||||
|
||||
// 1. AVEVA + OtOpcUa service state — actionable diagnostic if anything is missing.
|
||||
using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(10));
|
||||
PrerequisiteReport = await AvevaPrerequisites.CheckAllAsync(
|
||||
new AvevaPrerequisites.Options { CheckGalaxyHostPipe = true, CheckHistorian = false },
|
||||
cts.Token);
|
||||
|
||||
if (!PrerequisiteReport.IsLivetestReady)
|
||||
{
|
||||
SkipReason = PrerequisiteReport.SkipReason;
|
||||
return;
|
||||
}
|
||||
|
||||
// 2. Secret / pipe-name resolution. If the service is running but we can't discover its
|
||||
// env vars from registry (non-elevated test host), a clear message beats a silent
|
||||
// connect-rejected failure 10 seconds later.
|
||||
Config = LiveStackConfig.Resolve();
|
||||
if (Config is null)
|
||||
{
|
||||
SkipReason =
|
||||
$"Cannot resolve shared secret. Set {LiveStackConfig.EnvSharedSecret} (and optionally " +
|
||||
$"{LiveStackConfig.EnvPipeName}) in the environment, or run the test host elevated so it " +
|
||||
$"can read HKLM\\{LiveStackConfig.ServiceRegistryKey}\\Environment.";
|
||||
return;
|
||||
}
|
||||
|
||||
// 3. Connect. InitializeAsync does the pipe connect + handshake; a 5-second
|
||||
// ConnectTimeout gives enough headroom for a service that just started.
|
||||
Driver = new GalaxyProxyDriver(new GalaxyProxyOptions
|
||||
{
|
||||
DriverInstanceId = "live-stack-smoke",
|
||||
PipeName = Config.PipeName,
|
||||
SharedSecret = Config.SharedSecret,
|
||||
ConnectTimeout = TimeSpan.FromSeconds(5),
|
||||
});
|
||||
|
||||
try
|
||||
{
|
||||
await Driver.InitializeAsync(driverConfigJson: "{}", CancellationToken.None);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
SkipReason =
|
||||
$"Connected to named pipe '{Config.PipeName}' but GalaxyProxyDriver.InitializeAsync failed: " +
|
||||
$"{ex.GetType().Name}: {ex.Message}. Common causes: shared secret mismatch (rotated after last install), " +
|
||||
$"service account SID not in pipe ACL (installer sets OTOPCUA_ALLOWED_SID to the service account — " +
|
||||
$"test must run as that user), or Host's backend couldn't connect to ZB.";
|
||||
Driver.Dispose();
|
||||
Driver = null;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
public async ValueTask DisposeAsync()
|
||||
{
|
||||
if (Driver is not null)
|
||||
{
|
||||
try { await Driver.ShutdownAsync(CancellationToken.None); } catch { /* best-effort */ }
|
||||
Driver.Dispose();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Translate <see cref="SkipReason"/> into <c>Assert.Skip</c>. Tests call this at the
|
||||
/// top of every fact so a fixture init failure shows up as a cleanly-skipped test with
|
||||
/// the full prerequisites report, not a cascading NullReferenceException on
|
||||
/// <see cref="Driver"/>.
|
||||
/// </summary>
|
||||
public void SkipIfUnavailable()
|
||||
{
|
||||
if (SkipReason is not null) Assert.Skip(SkipReason);
|
||||
}
|
||||
|
||||
private static bool IsElevatedAdministratorOnWindows()
|
||||
{
|
||||
if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) return false;
|
||||
return CheckWindowsAdminToken();
|
||||
}
|
||||
|
||||
[SupportedOSPlatform("windows")]
|
||||
private static bool CheckWindowsAdminToken()
|
||||
{
|
||||
try
|
||||
{
|
||||
using var identity = WindowsIdentity.GetCurrent();
|
||||
return new WindowsPrincipal(identity).IsInRole(WindowsBuiltInRole.Administrator);
|
||||
}
|
||||
catch
|
||||
{
|
||||
// Probe shouldn't crash the test; if we can't determine elevation, optimistically
|
||||
// continue and let the actual pipe connect surface its own error.
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[CollectionDefinition(Name)]
|
||||
public sealed class LiveStackCollection : ICollectionFixture<LiveStackFixture>
|
||||
{
|
||||
public const string Name = "LiveStack";
|
||||
}
|
||||
@@ -0,0 +1,282 @@
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Proxy.Tests.LiveStack;
|
||||
|
||||
/// <summary>
|
||||
/// End-to-end smoke against the installed <c>OtOpcUaGalaxyHost</c> Windows service.
|
||||
/// Closes LMX follow-up #5 — exercises the full topology: <see cref="GalaxyProxyDriver"/>
|
||||
/// in-process → named-pipe IPC → <c>OtOpcUaGalaxyHost</c> service → <c>MxAccessGalaxyBackend</c> →
|
||||
/// live MXAccess runtime → real Galaxy objects + attributes.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Preconditions</b> (all checked by <see cref="LiveStackFixture"/>, surfaced via
|
||||
/// <c>Assert.Skip</c> when missing):
|
||||
/// </para>
|
||||
/// <list type="bullet">
|
||||
/// <item>AVEVA System Platform installed + Platform deployed.</item>
|
||||
/// <item><c>aaBootstrap</c> / <c>aaGR</c> / <c>NmxSvc</c> / <c>MSSQLSERVER</c> running.</item>
|
||||
/// <item>MXAccess COM server registered.</item>
|
||||
/// <item>ZB database exists with at least one deployed gobject.</item>
|
||||
/// <item><c>OtOpcUaGalaxyHost</c> service installed + running (named pipe accepting connections).</item>
|
||||
/// <item>Shared secret discoverable via <c>OTOPCUA_GALAXY_SECRET</c> env var or the
|
||||
/// service's registry Environment values (test host typically needs to be elevated
|
||||
/// to read the latter).</item>
|
||||
/// <item>Test process runs as the account listed in the service's pipe ACL
|
||||
/// (<c>OTOPCUA_ALLOWED_SID</c>, typically the service account per decision #76).</item>
|
||||
/// </list>
|
||||
/// <para>
|
||||
/// Tests here are deliberately read-only. Writes against live Galaxy attributes are a
|
||||
/// separate concern — they need a test-only UDA or an agreed scratch tag so they can't
|
||||
/// accidentally mutate a process-critical value. Adding a write test is a follow-up
|
||||
/// PR that reuses this fixture.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
[Trait("Category", "LiveGalaxy")]
|
||||
[Collection(LiveStackCollection.Name)]
|
||||
public sealed class LiveStackSmokeTests(LiveStackFixture fixture)
|
||||
{
|
||||
[Fact]
|
||||
public void Fixture_initialized_successfully()
|
||||
{
|
||||
fixture.SkipIfUnavailable();
|
||||
// If the fixture init succeeded, Driver is non-null and InitializeAsync completed.
|
||||
// This is the cheapest possible assertion that the IPC handshake worked end-to-end;
|
||||
// every other test in this class depends on it.
|
||||
fixture.Driver.ShouldNotBeNull();
|
||||
fixture.Config.ShouldNotBeNull();
|
||||
fixture.PrerequisiteReport.ShouldNotBeNull();
|
||||
fixture.PrerequisiteReport!.IsLivetestReady.ShouldBeTrue(fixture.PrerequisiteReport.SkipReason);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Driver_reports_Healthy_after_IPC_handshake()
|
||||
{
|
||||
fixture.SkipIfUnavailable();
|
||||
var health = fixture.Driver!.GetHealth();
|
||||
health.State.ShouldBe(DriverState.Healthy,
|
||||
$"Expected Healthy after successful IPC connect; Reason={health.LastError}");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DiscoverAsync_returns_at_least_one_variable_from_live_galaxy()
|
||||
{
|
||||
fixture.SkipIfUnavailable();
|
||||
var builder = new CapturingAddressSpaceBuilder();
|
||||
using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
|
||||
await fixture.Driver!.DiscoverAsync(builder, cts.Token);
|
||||
|
||||
builder.Variables.Count.ShouldBeGreaterThan(0,
|
||||
"Live Galaxy has > 0 deployed objects per the prereq check — at least one variable must be discovered. " +
|
||||
"Zero usually means the Host couldn't read ZB (check OTOPCUA_GALAXY_ZB_CONN in the service Environment).");
|
||||
|
||||
// Every discovered attribute must carry a non-empty FullName so the OPC UA server can
|
||||
// route reads/writes back. Regression guard — PR 19 normalized this across drivers.
|
||||
builder.Variables.ShouldAllBe(v => !string.IsNullOrEmpty(v.AttributeInfo.FullName));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void GetHostStatuses_reports_at_least_one_platform()
|
||||
{
|
||||
fixture.SkipIfUnavailable();
|
||||
var statuses = fixture.Driver!.GetHostStatuses();
|
||||
statuses.Count.ShouldBeGreaterThan(0,
|
||||
"Live Galaxy must report at least one Platform/AppEngine host via IHostConnectivityProbe. " +
|
||||
"Zero means the Host's probe loop hasn't completed its first tick or the Platform isn't deployed locally.");
|
||||
|
||||
// Host names are driver-opaque to the Core but non-empty by contract.
|
||||
statuses.ShouldAllBe(h => !string.IsNullOrEmpty(h.HostName));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Can_read_a_discovered_variable_from_live_galaxy()
|
||||
{
|
||||
fixture.SkipIfUnavailable();
|
||||
var builder = new CapturingAddressSpaceBuilder();
|
||||
using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
|
||||
await fixture.Driver!.DiscoverAsync(builder, cts.Token);
|
||||
builder.Variables.Count.ShouldBeGreaterThan(0);
|
||||
|
||||
// Pick the first discovered variable. Read-only smoke — we don't assert on Value,
|
||||
// only that a ReadAsync round-trip through Proxy → Host pipe → MXAccess → back
|
||||
// returns a snapshot with a non-BadInternalError status. Galaxy attributes default to
|
||||
// Uncertain quality until the Engine's first scan publishes them, which is fine here.
|
||||
var full = builder.Variables[0].AttributeInfo.FullName;
|
||||
var snapshots = await fixture.Driver!.ReadAsync([full], cts.Token);
|
||||
|
||||
snapshots.Count.ShouldBe(1);
|
||||
var snap = snapshots[0];
|
||||
snap.StatusCode.ShouldNotBe(0x80020000u,
|
||||
$"Read returned BadInternalError for {full} — the Host couldn't fulfil the request. " +
|
||||
$"Investigate: the Host service's logs at {System.Environment.GetFolderPath(System.Environment.SpecialFolder.CommonApplicationData)}\\OtOpcUa\\Galaxy\\logs.");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Write_then_read_roundtrips_a_writable_Boolean_attribute_on_TestMachine_001()
|
||||
{
|
||||
// PR 40 — finishes LMX #5. Targets DelmiaReceiver_001.TestAttribute, the writable
|
||||
// Boolean attribute on the TestMachine_001 hierarchy that the dev Galaxy was deployed
|
||||
// with for exactly this kind of integration testing. We invert the current value and
|
||||
// assert the new value comes back, then restore the original so the test is effectively
|
||||
// idempotent (Galaxy holds the value across runs since it's a deployed UDA).
|
||||
fixture.SkipIfUnavailable();
|
||||
const string fullRef = "DelmiaReceiver_001.TestAttribute";
|
||||
using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
|
||||
|
||||
// Read current value first — gives the cleanup path the right baseline. Galaxy may
|
||||
// return Uncertain quality until the Engine has scanned the attribute at least once;
|
||||
// we don't read into a strongly-typed bool until Status is Good.
|
||||
var before = (await fixture.Driver!.ReadAsync([fullRef], cts.Token))[0];
|
||||
before.StatusCode.ShouldNotBe(0x80020000u, $"baseline read failed for {fullRef}: {before.Value}");
|
||||
var originalBool = Convert.ToBoolean(before.Value ?? false);
|
||||
var inverted = !originalBool;
|
||||
|
||||
try
|
||||
{
|
||||
// Write the inverted value via IWritable.
|
||||
var writeResults = await fixture.Driver!.WriteAsync(
|
||||
[new(fullRef, inverted)], cts.Token);
|
||||
writeResults.Count.ShouldBe(1);
|
||||
writeResults[0].StatusCode.ShouldBe(0u,
|
||||
$"WriteAsync returned status 0x{writeResults[0].StatusCode:X8} for {fullRef} — " +
|
||||
$"check the Host service log at %ProgramData%\\OtOpcUa\\Galaxy\\.");
|
||||
|
||||
// The Engine's scan + acknowledgement is async — read in a short loop with a 5s
|
||||
// budget. Galaxy's attribute roundtrip on a dev box is typically sub-second but
|
||||
// we give headroom for first-scan after a service restart.
|
||||
DataValueSnapshot after = default!;
|
||||
var deadline = DateTime.UtcNow.AddSeconds(5);
|
||||
while (DateTime.UtcNow < deadline)
|
||||
{
|
||||
after = (await fixture.Driver!.ReadAsync([fullRef], cts.Token))[0];
|
||||
if (after.StatusCode == 0u && Convert.ToBoolean(after.Value ?? false) == inverted) break;
|
||||
await Task.Delay(200, cts.Token);
|
||||
}
|
||||
after.StatusCode.ShouldBe(0u, "post-write read failed");
|
||||
Convert.ToBoolean(after.Value ?? false).ShouldBe(inverted,
|
||||
$"Wrote {inverted} but Galaxy returned {after.Value} after the scan window.");
|
||||
}
|
||||
finally
|
||||
{
|
||||
// Restore — best-effort. If this throws the test still reports its primary result;
|
||||
// we just leave a flipped TestAttribute on the dev box (benign, name says it all).
|
||||
try { await fixture.Driver!.WriteAsync([new(fullRef, originalBool)], cts.Token); }
|
||||
catch { /* swallow */ }
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Subscribe_fires_OnDataChange_with_initial_value_then_again_after_a_write()
|
||||
{
|
||||
// Subscribe + write is the canonical "is the data path actually live" test for
|
||||
// an OPC UA driver. We subscribe to the same Boolean attribute, expect an initial-
|
||||
// value callback within a couple of seconds (per ISubscribable's contract — the
|
||||
// driver MAY fire OnDataChange immediately with the current value), then write a
|
||||
// distinct value and expect a second callback carrying the new value.
|
||||
fixture.SkipIfUnavailable();
|
||||
const string fullRef = "DelmiaReceiver_001.TestAttribute";
|
||||
using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
|
||||
|
||||
// Capture every OnDataChange notification for this fullRef onto a thread-safe queue
|
||||
// we can poll from the test thread. Galaxy's MXAccess advisory fires on its own
|
||||
// thread; we don't want to block it.
|
||||
var notifications = new System.Collections.Concurrent.ConcurrentQueue<DataValueSnapshot>();
|
||||
void Handler(object? sender, DataChangeEventArgs e)
|
||||
{
|
||||
if (string.Equals(e.FullReference, fullRef, StringComparison.OrdinalIgnoreCase))
|
||||
notifications.Enqueue(e.Snapshot);
|
||||
}
|
||||
fixture.Driver!.OnDataChange += Handler;
|
||||
|
||||
// Read current value so we know which value to write to force a transition.
|
||||
var before = (await fixture.Driver!.ReadAsync([fullRef], cts.Token))[0];
|
||||
var originalBool = Convert.ToBoolean(before.Value ?? false);
|
||||
var toWrite = !originalBool;
|
||||
|
||||
ISubscriptionHandle? handle = null;
|
||||
try
|
||||
{
|
||||
handle = await fixture.Driver!.SubscribeAsync(
|
||||
[fullRef], TimeSpan.FromMilliseconds(250), cts.Token);
|
||||
|
||||
// Wait for initial-value notification — typical < 1s on a hot Galaxy, give 5s.
|
||||
await WaitForAsync(() => notifications.Count >= 1, TimeSpan.FromSeconds(5), cts.Token);
|
||||
notifications.Count.ShouldBeGreaterThanOrEqualTo(1,
|
||||
$"No initial-value OnDataChange for {fullRef} within 5s. " +
|
||||
$"Either MXAccess subscription failed silently or the Engine hasn't scanned yet.");
|
||||
|
||||
// Drain the initial-value queue before writing so we count post-write deltas only.
|
||||
var initialCount = notifications.Count;
|
||||
|
||||
// Write the toggled value. Engine scan + advisory fires the second callback.
|
||||
var w = await fixture.Driver!.WriteAsync([new(fullRef, toWrite)], cts.Token);
|
||||
w[0].StatusCode.ShouldBe(0u);
|
||||
|
||||
await WaitForAsync(() => notifications.Count > initialCount, TimeSpan.FromSeconds(8), cts.Token);
|
||||
notifications.Count.ShouldBeGreaterThan(initialCount,
|
||||
$"OnDataChange did not fire after writing {toWrite} to {fullRef} within 8s.");
|
||||
|
||||
// Find the post-write notification carrying the toggled value (initial value may
|
||||
// appear multiple times before the write commits — search the tail).
|
||||
var postWrite = notifications.ToArray().Reverse()
|
||||
.FirstOrDefault(n => n.StatusCode == 0u && Convert.ToBoolean(n.Value ?? false) == toWrite);
|
||||
postWrite.ShouldNotBe(default,
|
||||
$"No OnDataChange carrying the toggled value {toWrite} appeared in the queue: " +
|
||||
string.Join(",", notifications.Select(n => $"{n.Value}@{n.StatusCode:X8}")));
|
||||
}
|
||||
finally
|
||||
{
|
||||
fixture.Driver!.OnDataChange -= Handler;
|
||||
if (handle is not null)
|
||||
{
|
||||
try { await fixture.Driver!.UnsubscribeAsync(handle, cts.Token); } catch { /* swallow */ }
|
||||
}
|
||||
// Restore baseline.
|
||||
try { await fixture.Driver!.WriteAsync([new(fullRef, originalBool)], cts.Token); } catch { /* swallow */ }
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task WaitForAsync(Func<bool> predicate, TimeSpan budget, CancellationToken ct)
|
||||
{
|
||||
var deadline = DateTime.UtcNow + budget;
|
||||
while (DateTime.UtcNow < deadline)
|
||||
{
|
||||
if (predicate()) return;
|
||||
await Task.Delay(100, ct);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Minimal <see cref="IAddressSpaceBuilder"/> implementation that captures every
|
||||
/// Variable() call into a flat list so tests can inspect what discovery produced
|
||||
/// without running the full OPC UA node-manager stack.
|
||||
/// </summary>
|
||||
private sealed class CapturingAddressSpaceBuilder : IAddressSpaceBuilder
|
||||
{
|
||||
public List<(string BrowseName, DriverAttributeInfo AttributeInfo)> Variables { get; } = [];
|
||||
|
||||
public IAddressSpaceBuilder Folder(string browseName, string displayName) => this;
|
||||
public IVariableHandle Variable(string browseName, string displayName, DriverAttributeInfo attributeInfo)
|
||||
{
|
||||
Variables.Add((browseName, attributeInfo));
|
||||
return new NoopHandle(attributeInfo.FullName);
|
||||
}
|
||||
public void AddProperty(string browseName, DriverDataType dataType, object? value) { }
|
||||
|
||||
private sealed class NoopHandle(string fullReference) : IVariableHandle
|
||||
{
|
||||
public string FullReference { get; } = fullReference;
|
||||
public IAlarmConditionSink MarkAsAlarmCondition(AlarmConditionInfo info) => new NoopSink();
|
||||
private sealed class NoopSink : IAlarmConditionSink
|
||||
{
|
||||
public void OnTransition(AlarmEventArgs args) { }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -22,6 +22,7 @@
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\src\ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Proxy\ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Proxy.csproj"/>
|
||||
<ProjectReference Include="..\..\src\ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host\ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.csproj"/>
|
||||
<ProjectReference Include="..\ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport\ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
||||
@@ -0,0 +1,163 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport.Probes;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport;
|
||||
|
||||
/// <summary>
|
||||
/// Entry point for live-AVEVA test fixtures. Runs every relevant probe and returns a
|
||||
/// <see cref="PrerequisiteReport"/> whose <c>SkipReason</c> feeds <c>Assert.Skip</c> when
|
||||
/// the environment isn't set up. Non-Windows hosts get a single aggregated Skip row per
|
||||
/// category instead of a flood of individual skips.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para><b>Call shape</b>:</para>
|
||||
/// <code>
|
||||
/// var report = await AvevaPrerequisites.CheckAllAsync();
|
||||
/// if (report.SkipReason is not null) Assert.Skip(report.SkipReason);
|
||||
/// </code>
|
||||
/// <para><b>Categories in rough order of 'would I want to know first?'</b>:</para>
|
||||
/// <list type="number">
|
||||
/// <item>Environment — process bitness, OS platform, RPCSS up.</item>
|
||||
/// <item>AvevaInstall — Framework registry, install paths, no pending reboot.</item>
|
||||
/// <item>AvevaCoreService — aaBootstrap / aaGR / NmxSvc running.</item>
|
||||
/// <item>MxAccessCom — LMXProxy.LMXProxyServer ProgID → CLSID → file-on-disk.</item>
|
||||
/// <item>GalaxyRepository — SQL reachable, ZB exists, deployed-object count.</item>
|
||||
/// <item>OtOpcUaService — our two Windows services + GLAuth.</item>
|
||||
/// <item>AvevaSoftService — aaLogger etc., warn only.</item>
|
||||
/// <item>AvevaHistorian — aahClientAccessPoint etc., optional.</item>
|
||||
/// </list>
|
||||
/// <para><b>What's NOT checked here</b>: end-to-end subscribe / read / write against a real
|
||||
/// Galaxy tag. That's the job of the live-smoke tests this helper gates — the helper just
|
||||
/// tells them whether running is worthwhile.</para>
|
||||
/// </remarks>
|
||||
public static class AvevaPrerequisites
|
||||
{
|
||||
// -------- Individual service lists (kept as data so tests can inspect / override) --------
|
||||
|
||||
/// <summary>Services whose absence means live-Galaxy tests can't run at all.</summary>
|
||||
internal static readonly (string Name, string Purpose)[] CoreServices =
|
||||
[
|
||||
("aaBootstrap", "master service that starts the Platform process + brokers aa* communication"),
|
||||
("aaGR", "Galaxy Repository host — mediates IDE / runtime access to ZB"),
|
||||
("NmxSvc", "Network Message Exchange — MXAccess + Bootstrap transport"),
|
||||
("MSSQLSERVER", "SQL Server instance that hosts the ZB database"),
|
||||
];
|
||||
|
||||
/// <summary>Warn-but-don't-fail AVEVA services.</summary>
|
||||
internal static readonly (string Name, string Purpose)[] SoftServices =
|
||||
[
|
||||
("aaLogger", "ArchestrA Logger — diagnostic log receiver; stack runs without it but error visibility suffers"),
|
||||
("aaUserValidator", "OS user/group auth for ArchestrA security; only required when Galaxy security mode isn't 'Open'"),
|
||||
("aaGlobalDataCacheMonitorSvr", "cross-platform global data cache; single-node dev boxes run fine without it"),
|
||||
];
|
||||
|
||||
/// <summary>Optional AVEVA Historian services — only required for HistoryRead IPC paths.</summary>
|
||||
internal static readonly (string Name, string Purpose)[] HistorianServices =
|
||||
[
|
||||
("aahClientAccessPoint", "AVEVA Historian Client Access Point — HistoryRead IPC endpoint"),
|
||||
("aahGateway", "AVEVA Historian Gateway"),
|
||||
];
|
||||
|
||||
/// <summary>OtOpcUa-stack Windows services + third-party deps we manage.</summary>
|
||||
internal static readonly (string Name, string Purpose, bool HardRequired)[] OtOpcUaServices =
|
||||
[
|
||||
("OtOpcUaGalaxyHost", "Galaxy.Host out-of-process service (net48 x86, STA + MXAccess)", true),
|
||||
("OtOpcUa", "Main OPC UA server service (hosts Proxy + DriverHost + Admin-facing DB publisher)", false),
|
||||
("GLAuth", "LDAP server (dev only) — glauth.exe on localhost:3893", false),
|
||||
];
|
||||
|
||||
// -------- Orchestrator --------
|
||||
|
||||
public static async Task<PrerequisiteReport> CheckAllAsync(
|
||||
Options? options = null, CancellationToken ct = default)
|
||||
{
|
||||
options ??= new Options();
|
||||
var checks = new List<PrerequisiteCheck>();
|
||||
|
||||
// Environment
|
||||
checks.Add(MxAccessComProbe.CheckProcessBitness());
|
||||
|
||||
// AvevaInstall — registry + files
|
||||
checks.Add(RegistryProbe.CheckFrameworkInstalled());
|
||||
checks.Add(RegistryProbe.CheckPlatformDeployed());
|
||||
checks.Add(RegistryProbe.CheckRebootPending());
|
||||
|
||||
// AvevaCoreService
|
||||
foreach (var (name, purpose) in CoreServices)
|
||||
checks.Add(ServiceProbe.Check(name, PrerequisiteCategory.AvevaCoreService, hardRequired: true, whatItDoes: purpose));
|
||||
|
||||
// MxAccessCom
|
||||
checks.Add(MxAccessComProbe.Check());
|
||||
|
||||
// GalaxyRepository
|
||||
checks.Add(await SqlProbe.CheckZbDatabaseAsync(options.SqlConnectionString, ct));
|
||||
// Deployed-object count only makes sense if the DB check passed.
|
||||
if (checks[checks.Count - 1].Status == PrerequisiteStatus.Pass)
|
||||
checks.Add(await SqlProbe.CheckDeployedObjectCountAsync(options.SqlConnectionString, ct));
|
||||
|
||||
// OtOpcUaService
|
||||
foreach (var (name, purpose, hard) in OtOpcUaServices)
|
||||
checks.Add(ServiceProbe.Check(name, PrerequisiteCategory.OtOpcUaService, hardRequired: hard, whatItDoes: purpose));
|
||||
if (options.CheckGalaxyHostPipe)
|
||||
checks.Add(await NamedPipeProbe.CheckGalaxyHostPipeAsync(options.GalaxyHostPipeName, ct));
|
||||
|
||||
// AvevaSoftService
|
||||
foreach (var (name, purpose) in SoftServices)
|
||||
checks.Add(ServiceProbe.Check(name, PrerequisiteCategory.AvevaSoftService, hardRequired: false, whatItDoes: purpose));
|
||||
|
||||
// AvevaHistorian
|
||||
if (options.CheckHistorian)
|
||||
{
|
||||
foreach (var (name, purpose) in HistorianServices)
|
||||
checks.Add(ServiceProbe.Check(name, PrerequisiteCategory.AvevaHistorian, hardRequired: false, whatItDoes: purpose));
|
||||
}
|
||||
|
||||
return new PrerequisiteReport(checks);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Narrower check for tests that only need the Galaxy Repository (SQL) path — don't
|
||||
/// pay the cost of probing every aa* service when the test only reads gobject rows.
|
||||
/// </summary>
|
||||
public static async Task<PrerequisiteReport> CheckRepositoryOnlyAsync(
|
||||
string? sqlConnectionString = null, CancellationToken ct = default)
|
||||
{
|
||||
var checks = new List<PrerequisiteCheck>
|
||||
{
|
||||
await SqlProbe.CheckZbDatabaseAsync(sqlConnectionString, ct),
|
||||
};
|
||||
if (checks[0].Status == PrerequisiteStatus.Pass)
|
||||
checks.Add(await SqlProbe.CheckDeployedObjectCountAsync(sqlConnectionString, ct));
|
||||
return new PrerequisiteReport(checks);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Narrower check for the named-pipe endpoint — tests that drive the full Proxy
|
||||
/// against a live Galaxy.Host service don't need the SQL or AVEVA-internal probes
|
||||
/// (the Host does that work internally; we just need the pipe to accept).
|
||||
/// </summary>
|
||||
public static async Task<PrerequisiteReport> CheckGalaxyHostPipeOnlyAsync(
|
||||
string? pipeName = null, CancellationToken ct = default)
|
||||
{
|
||||
var checks = new List<PrerequisiteCheck>
|
||||
{
|
||||
await NamedPipeProbe.CheckGalaxyHostPipeAsync(pipeName, ct),
|
||||
};
|
||||
return new PrerequisiteReport(checks);
|
||||
}
|
||||
|
||||
/// <summary>Knobs for <see cref="CheckAllAsync"/>.</summary>
|
||||
public sealed class Options
|
||||
{
|
||||
/// <summary>SQL Server connection string — defaults to Windows-auth <c>localhost\ZB</c>.</summary>
|
||||
public string? SqlConnectionString { get; init; }
|
||||
|
||||
/// <summary>Named-pipe endpoint for OtOpcUaGalaxyHost — defaults to <c>OtOpcUaGalaxy</c>.</summary>
|
||||
public string? GalaxyHostPipeName { get; init; }
|
||||
|
||||
/// <summary>Include the named-pipe probe. Off by default — it's a seconds-long TCP-like probe and some tests don't need it.</summary>
|
||||
public bool CheckGalaxyHostPipe { get; init; } = true;
|
||||
|
||||
/// <summary>Include Historian service probes. Off by default — Historian is optional.</summary>
|
||||
public bool CheckHistorian { get; init; } = false;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
#if NET48
|
||||
// Polyfills for C# 9+ language features that the helper uses but that net48 BCL doesn't
|
||||
// provide. Keeps the sources single-target-free at the language level — the same .cs files
|
||||
// build on both frameworks without preprocessor guards in the callsites.
|
||||
|
||||
namespace System.Runtime.CompilerServices
|
||||
{
|
||||
/// <summary>Required by C# 9 <c>init</c>-only setters and <c>record</c> types.</summary>
|
||||
internal static class IsExternalInit { }
|
||||
}
|
||||
|
||||
namespace System.Runtime.Versioning
|
||||
{
|
||||
/// <summary>
|
||||
/// Minimal shim for the .NET 5+ <c>SupportedOSPlatformAttribute</c>. Pure marker for the
|
||||
/// compiler on net10; on net48 we still want the attribute to exist so the same
|
||||
/// <c>[SupportedOSPlatform("windows")]</c> source compiles. The attribute is internal
|
||||
/// and attribute-targets-everything to minimize surface.
|
||||
/// </summary>
|
||||
[AttributeUsage(AttributeTargets.All, Inherited = false, AllowMultiple = true)]
|
||||
internal sealed class SupportedOSPlatformAttribute(string platformName) : Attribute
|
||||
{
|
||||
public string PlatformName { get; } = platformName;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,44 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport;
|
||||
|
||||
/// <summary>One prerequisite probe's outcome. <see cref="AvevaPrerequisites"/> returns many of these.</summary>
|
||||
/// <param name="Name">Short diagnostic id — e.g. <c>service:aaBootstrap</c>, <c>sql:ZB</c>, <c>registry:ArchestrA.Framework</c>.</param>
|
||||
/// <param name="Category">Which subsystem the probe belongs to — lets callers filter (e.g. "Historian warns don't gate the core Galaxy smoke").</param>
|
||||
/// <param name="Status">Outcome.</param>
|
||||
/// <param name="Detail">One-line specific message an operator can act on — <c>"aaGR not installed — install the Galaxy Repository role from the System Platform setup"</c> beats <c>"failed"</c>.</param>
|
||||
public sealed record PrerequisiteCheck(
|
||||
string Name,
|
||||
PrerequisiteCategory Category,
|
||||
PrerequisiteStatus Status,
|
||||
string Detail);
|
||||
|
||||
public enum PrerequisiteStatus
|
||||
{
|
||||
/// <summary>Prerequisite is met; no action needed.</summary>
|
||||
Pass,
|
||||
/// <summary>Soft dependency missing — stack still runs but some feature (e.g. logging) is degraded.</summary>
|
||||
Warn,
|
||||
/// <summary>Hard dependency missing — live tests can't proceed; <see cref="PrerequisiteReport.SkipReason"/> surfaces this.</summary>
|
||||
Fail,
|
||||
/// <summary>Probe wasn't applicable in this environment (e.g. non-Windows host, Historian not installed).</summary>
|
||||
Skip,
|
||||
}
|
||||
|
||||
public enum PrerequisiteCategory
|
||||
{
|
||||
/// <summary>Platform sanity — process bitness, OS platform, DCOM/RPCSS.</summary>
|
||||
Environment,
|
||||
/// <summary>Hard-required AVEVA Windows services (aaBootstrap, aaGR, NmxSvc).</summary>
|
||||
AvevaCoreService,
|
||||
/// <summary>Soft-required AVEVA Windows services (aaLogger, aaUserValidator) — warn only.</summary>
|
||||
AvevaSoftService,
|
||||
/// <summary>ArchestrA Framework install markers (registry + files).</summary>
|
||||
AvevaInstall,
|
||||
/// <summary>MXAccess COM server registration + file on disk.</summary>
|
||||
MxAccessCom,
|
||||
/// <summary>SQL Server reachability + ZB database presence + deployed-object count.</summary>
|
||||
GalaxyRepository,
|
||||
/// <summary>Historian services (optional — only required for HistoryRead IPC paths).</summary>
|
||||
AvevaHistorian,
|
||||
/// <summary>OtOpcUa-side services (OtOpcUa, OtOpcUaGalaxyHost) + third-party deps (GLAuth).</summary>
|
||||
OtOpcUaService,
|
||||
}
|
||||
@@ -0,0 +1,94 @@
|
||||
using System.Text;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport;
|
||||
|
||||
/// <summary>
|
||||
/// Aggregated result of an <see cref="AvevaPrerequisites.CheckAll"/> run. Test fixtures
|
||||
/// typically call <see cref="SkipReason"/> to produce the argument for xUnit's
|
||||
/// <c>Assert.Skip</c> when any hard dependency failed.
|
||||
/// </summary>
|
||||
public sealed class PrerequisiteReport
|
||||
{
|
||||
public IReadOnlyList<PrerequisiteCheck> Checks { get; }
|
||||
|
||||
public PrerequisiteReport(IEnumerable<PrerequisiteCheck> checks)
|
||||
{
|
||||
Checks = [.. checks];
|
||||
}
|
||||
|
||||
/// <summary>True when every probe is Pass / Warn / Skip — no Fail entries.</summary>
|
||||
public bool IsLivetestReady => !Checks.Any(c => c.Status == PrerequisiteStatus.Fail);
|
||||
|
||||
/// <summary>
|
||||
/// True when only the AVEVA-side probes pass — ignores failures in the
|
||||
/// <see cref="PrerequisiteCategory.OtOpcUaService"/> category. Lets a live-test gate
|
||||
/// say "AVEVA is ready even if the v2 services aren't installed yet" without
|
||||
/// conflating the two. Useful for tests that exercise Galaxy directly (e.g.
|
||||
/// <see cref="GalaxyRepositoryLiveSmokeTests"/>) rather than through our stack.
|
||||
/// </summary>
|
||||
public bool IsAvevaSideReady =>
|
||||
!Checks.Any(c => c.Status == PrerequisiteStatus.Fail && c.Category != PrerequisiteCategory.OtOpcUaService);
|
||||
|
||||
/// <summary>
|
||||
/// Multi-line message for <c>Assert.Skip</c> when a hard dependency isn't met. Returns
|
||||
/// null when <see cref="IsLivetestReady"/> is true.
|
||||
/// </summary>
|
||||
public string? SkipReason
|
||||
{
|
||||
get
|
||||
{
|
||||
var fails = Checks.Where(c => c.Status == PrerequisiteStatus.Fail).ToList();
|
||||
if (fails.Count == 0) return null;
|
||||
|
||||
var sb = new StringBuilder();
|
||||
sb.AppendLine($"Live-AVEVA prerequisites not met ({fails.Count} failed):");
|
||||
foreach (var f in fails)
|
||||
sb.AppendLine($" • [{f.Category}] {f.Name} — {f.Detail}");
|
||||
sb.Append("Run `Get-Service aa*` / `sqlcmd -S localhost -d ZB -E -Q \"SELECT 1\"` to triage.");
|
||||
return sb.ToString();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Human-readable summary of warnings — caller decides whether to log or ignore. Useful
|
||||
/// when a live test does pass but an operator should know their environment is degraded.
|
||||
/// </summary>
|
||||
public string? Warnings
|
||||
{
|
||||
get
|
||||
{
|
||||
var warns = Checks.Where(c => c.Status == PrerequisiteStatus.Warn).ToList();
|
||||
if (warns.Count == 0) return null;
|
||||
|
||||
var sb = new StringBuilder();
|
||||
sb.AppendLine($"AVEVA prerequisites with warnings ({warns.Count}):");
|
||||
foreach (var w in warns)
|
||||
sb.AppendLine($" • [{w.Category}] {w.Name} — {w.Detail}");
|
||||
return sb.ToString();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Throw <see cref="InvalidOperationException"/> if any <paramref name="categories"/>
|
||||
/// contain a Fail — useful when a specific test needs, say, Galaxy Repository but doesn't
|
||||
/// care about Historian. Call before <c>Assert.Skip</c> if you want to be strict.
|
||||
/// </summary>
|
||||
public void RequireCategories(params PrerequisiteCategory[] categories)
|
||||
{
|
||||
var set = categories.ToHashSet();
|
||||
var fails = Checks.Where(c => c.Status == PrerequisiteStatus.Fail && set.Contains(c.Category)).ToList();
|
||||
if (fails.Count == 0) return;
|
||||
|
||||
var detail = string.Join("; ", fails.Select(f => $"{f.Name}: {f.Detail}"));
|
||||
throw new InvalidOperationException($"Required prerequisite categories failed: {detail}");
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
var sb = new StringBuilder();
|
||||
sb.AppendLine($"PrerequisiteReport: {Checks.Count} checks");
|
||||
foreach (var c in Checks)
|
||||
sb.AppendLine($" [{c.Status,-4}] {c.Category}/{c.Name}: {c.Detail}");
|
||||
return sb.ToString();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,102 @@
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Runtime.Versioning;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport.Probes;
|
||||
|
||||
/// <summary>
|
||||
/// Confirms MXAccess COM server registration by resolving the
|
||||
/// <c>LMXProxy.LMXProxyServer</c> ProgID to its CLSID, then checking that the CLSID's
|
||||
/// 32-bit <c>InprocServer32</c> entry points at a file that exists on disk.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// A common failure mode on partial installs: ProgID is registered but the CLSID
|
||||
/// InprocServer32 DLL is missing (previous install uninstalled but registry orphan remains).
|
||||
/// This probe surfaces that case with an actionable message instead of the
|
||||
/// <c>0x80040154 REGDB_E_CLASSNOTREG</c> you'd see from a late COM activation failure.
|
||||
/// </remarks>
|
||||
public static class MxAccessComProbe
|
||||
{
|
||||
public const string ProgId = "LMXProxy.LMXProxyServer";
|
||||
public const string VersionedProgId = "LMXProxy.LMXProxyServer.1";
|
||||
|
||||
public static PrerequisiteCheck Check()
|
||||
{
|
||||
if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
|
||||
{
|
||||
return new PrerequisiteCheck("com:LMXProxy", PrerequisiteCategory.MxAccessCom,
|
||||
PrerequisiteStatus.Skip, "COM registration probes only run on Windows.");
|
||||
}
|
||||
return CheckWindows();
|
||||
}
|
||||
|
||||
[SupportedOSPlatform("windows")]
|
||||
private static PrerequisiteCheck CheckWindows()
|
||||
{
|
||||
try
|
||||
{
|
||||
var (clsid, dll) = RegistryProbe.ResolveProgIdToInproc(ProgId);
|
||||
if (clsid is null)
|
||||
{
|
||||
return new PrerequisiteCheck("com:LMXProxy", PrerequisiteCategory.MxAccessCom,
|
||||
PrerequisiteStatus.Fail,
|
||||
$"ProgID {ProgId} not registered — MXAccess COM server isn't installed. " +
|
||||
$"Install System Platform's MXAccess component and re-run.");
|
||||
}
|
||||
|
||||
if (string.IsNullOrWhiteSpace(dll))
|
||||
{
|
||||
return new PrerequisiteCheck("com:LMXProxy", PrerequisiteCategory.MxAccessCom,
|
||||
PrerequisiteStatus.Fail,
|
||||
$"ProgID {ProgId} → CLSID {clsid} but InprocServer32 is empty. " +
|
||||
$"Registry is orphaned; re-register with: regsvr32 /s LmxProxy.dll (from an elevated cmd in the Framework bin dir).");
|
||||
}
|
||||
|
||||
// Resolve the recorded path — sometimes registered as a bare filename that the COM
|
||||
// runtime resolves via the current process's DLL-search path. Accept either an
|
||||
// absolute path that exists, or a bare filename whose resolution we can't verify
|
||||
// without loading it (treat as Pass-with-note).
|
||||
if (Path.IsPathRooted(dll))
|
||||
{
|
||||
if (!File.Exists(dll))
|
||||
{
|
||||
return new PrerequisiteCheck("com:LMXProxy", PrerequisiteCategory.MxAccessCom,
|
||||
PrerequisiteStatus.Fail,
|
||||
$"ProgID {ProgId} → CLSID {clsid} → InprocServer32 {dll}, but the file is missing. " +
|
||||
$"Re-install the Framework or restore from backup.");
|
||||
}
|
||||
return new PrerequisiteCheck("com:LMXProxy", PrerequisiteCategory.MxAccessCom,
|
||||
PrerequisiteStatus.Pass,
|
||||
$"ProgID {ProgId} → {dll} (file exists).");
|
||||
}
|
||||
|
||||
return new PrerequisiteCheck("com:LMXProxy", PrerequisiteCategory.MxAccessCom,
|
||||
PrerequisiteStatus.Pass,
|
||||
$"ProgID {ProgId} → {dll} (bare filename — relies on PATH resolution at COM activation time).");
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return new PrerequisiteCheck("com:LMXProxy", PrerequisiteCategory.MxAccessCom,
|
||||
PrerequisiteStatus.Warn,
|
||||
$"Probe failed: {ex.GetType().Name}: {ex.Message}");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Warn when running as a 64-bit process — MXAccess COM activation will fail with
|
||||
/// <c>0x80040154</c> regardless of registration state. The production drivers run net48
|
||||
/// x86; xunit hosts run 64-bit by default so this often surfaces first.
|
||||
/// </summary>
|
||||
public static PrerequisiteCheck CheckProcessBitness()
|
||||
{
|
||||
if (Environment.Is64BitProcess)
|
||||
{
|
||||
return new PrerequisiteCheck("env:ProcessBitness", PrerequisiteCategory.Environment,
|
||||
PrerequisiteStatus.Warn,
|
||||
"Test host is 64-bit. Direct MXAccess COM activation would fail with REGDB_E_CLASSNOTREG (0x80040154); " +
|
||||
"the production driver workaround is to run Galaxy.Host as a 32-bit process. Tests that only " +
|
||||
"talk to the Host service over the named pipe aren't affected.");
|
||||
}
|
||||
return new PrerequisiteCheck("env:ProcessBitness", PrerequisiteCategory.Environment,
|
||||
PrerequisiteStatus.Pass, "Test host is 32-bit.");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
using System.IO.Pipes;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport.Probes;
|
||||
|
||||
/// <summary>
|
||||
/// Verifies the <c>OtOpcUaGalaxyHost</c> named-pipe endpoint is accepting connections —
|
||||
/// the handshake the Proxy performs at boot. A clean pipe connect without sending any
|
||||
/// framed message proves the Host service is listening; we disconnect immediately so we
|
||||
/// don't consume a session slot.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Default pipe name matches the installer script's <c>OTOPCUA_GALAXY_PIPE</c> default.
|
||||
/// Override when the Host service was installed with a non-default name (custom deployments).
|
||||
/// </remarks>
|
||||
public static class NamedPipeProbe
|
||||
{
|
||||
public const string DefaultGalaxyHostPipeName = "OtOpcUaGalaxy";
|
||||
|
||||
public static async Task<PrerequisiteCheck> CheckGalaxyHostPipeAsync(
|
||||
string? pipeName = null, CancellationToken ct = default)
|
||||
{
|
||||
pipeName ??= DefaultGalaxyHostPipeName;
|
||||
try
|
||||
{
|
||||
using var client = new NamedPipeClientStream(
|
||||
serverName: ".",
|
||||
pipeName: pipeName,
|
||||
direction: PipeDirection.InOut,
|
||||
options: PipeOptions.Asynchronous);
|
||||
using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
cts.CancelAfter(TimeSpan.FromSeconds(2));
|
||||
await client.ConnectAsync(cts.Token);
|
||||
|
||||
return new PrerequisiteCheck("pipe:OtOpcUaGalaxyHost", PrerequisiteCategory.OtOpcUaService,
|
||||
PrerequisiteStatus.Pass,
|
||||
$@"Pipe \\.\pipe\{pipeName} accepted a connection — OtOpcUaGalaxyHost is listening.");
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
return new PrerequisiteCheck("pipe:OtOpcUaGalaxyHost", PrerequisiteCategory.OtOpcUaService,
|
||||
PrerequisiteStatus.Fail,
|
||||
$@"Pipe \\.\pipe\{pipeName} not connectable within 2s — OtOpcUaGalaxyHost service isn't running. " +
|
||||
"Start with: sc.exe start OtOpcUaGalaxyHost");
|
||||
}
|
||||
catch (TimeoutException)
|
||||
{
|
||||
return new PrerequisiteCheck("pipe:OtOpcUaGalaxyHost", PrerequisiteCategory.OtOpcUaService,
|
||||
PrerequisiteStatus.Fail,
|
||||
$@"Pipe \\.\pipe\{pipeName} connect timed out — service may be starting or stuck. " +
|
||||
"Check: sc.exe query OtOpcUaGalaxyHost");
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return new PrerequisiteCheck("pipe:OtOpcUaGalaxyHost", PrerequisiteCategory.OtOpcUaService,
|
||||
PrerequisiteStatus.Fail,
|
||||
$@"Pipe \\.\pipe\{pipeName} connect failed: {ex.GetType().Name}: {ex.Message}");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,162 @@
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Runtime.Versioning;
|
||||
using Microsoft.Win32;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.TestSupport.Probes;
|
||||
|
||||
/// <summary>
|
||||
/// Reads HKLM registry keys to confirm ArchestrA Framework / System Platform install
|
||||
/// markers. Matches the registered paths documented in
|
||||
/// <c>docs/v2/implementation/</c> — System Platform is 32-bit so keys live under
|
||||
/// <c>HKLM\SOFTWARE\WOW6432Node\ArchestrA\...</c>.
|
||||
/// </summary>
|
||||
public static class RegistryProbe
|
||||
{
|
||||
// Canonical install roots per the research on our dev box (System Platform 2020 R2).
|
||||
public const string ArchestrARootKey = @"SOFTWARE\WOW6432Node\ArchestrA";
|
||||
public const string FrameworkKey = @"SOFTWARE\WOW6432Node\ArchestrA\Framework";
|
||||
public const string PlatformKey = @"SOFTWARE\WOW6432Node\ArchestrA\Framework\Platform";
|
||||
public const string MsiInstallKey = @"SOFTWARE\WOW6432Node\ArchestrA\MSIInstall";
|
||||
|
||||
public static PrerequisiteCheck CheckFrameworkInstalled()
|
||||
{
|
||||
if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
|
||||
{
|
||||
return new PrerequisiteCheck("registry:ArchestrA.Framework", PrerequisiteCategory.AvevaInstall,
|
||||
PrerequisiteStatus.Skip, "Registry probes only run on Windows.");
|
||||
}
|
||||
return FrameworkInstalledWindows();
|
||||
}
|
||||
|
||||
public static PrerequisiteCheck CheckPlatformDeployed()
|
||||
{
|
||||
if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
|
||||
{
|
||||
return new PrerequisiteCheck("registry:ArchestrA.Platform", PrerequisiteCategory.AvevaInstall,
|
||||
PrerequisiteStatus.Skip, "Registry probes only run on Windows.");
|
||||
}
|
||||
return PlatformDeployedWindows();
|
||||
}
|
||||
|
||||
public static PrerequisiteCheck CheckRebootPending()
|
||||
{
|
||||
if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
|
||||
{
|
||||
return new PrerequisiteCheck("registry:ArchestrA.RebootPending", PrerequisiteCategory.AvevaInstall,
|
||||
PrerequisiteStatus.Skip, "Registry probes only run on Windows.");
|
||||
}
|
||||
return RebootPendingWindows();
|
||||
}
|
||||
|
||||
[SupportedOSPlatform("windows")]
|
||||
private static PrerequisiteCheck FrameworkInstalledWindows()
|
||||
{
|
||||
try
|
||||
{
|
||||
using var key = Registry.LocalMachine.OpenSubKey(FrameworkKey);
|
||||
if (key is null)
|
||||
{
|
||||
return new PrerequisiteCheck("registry:ArchestrA.Framework", PrerequisiteCategory.AvevaInstall,
|
||||
PrerequisiteStatus.Fail,
|
||||
$"Missing {FrameworkKey} — ArchestrA Framework isn't installed. Install AVEVA System Platform from the setup media.");
|
||||
}
|
||||
|
||||
var installPath = key.GetValue("InstallPath") as string;
|
||||
var rootPath = key.GetValue("RootPath") as string;
|
||||
if (string.IsNullOrWhiteSpace(installPath) || string.IsNullOrWhiteSpace(rootPath))
|
||||
{
|
||||
return new PrerequisiteCheck("registry:ArchestrA.Framework", PrerequisiteCategory.AvevaInstall,
|
||||
PrerequisiteStatus.Warn,
|
||||
$"Framework key exists but InstallPath/RootPath values missing — install may be incomplete.");
|
||||
}
|
||||
|
||||
return new PrerequisiteCheck("registry:ArchestrA.Framework", PrerequisiteCategory.AvevaInstall,
|
||||
PrerequisiteStatus.Pass,
|
||||
$"Installed at {installPath} (RootPath {rootPath}).");
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return new PrerequisiteCheck("registry:ArchestrA.Framework", PrerequisiteCategory.AvevaInstall,
|
||||
PrerequisiteStatus.Warn,
|
||||
$"Probe failed: {ex.GetType().Name}: {ex.Message}");
|
||||
}
|
||||
}
|
||||
|
||||
[SupportedOSPlatform("windows")]
|
||||
private static PrerequisiteCheck PlatformDeployedWindows()
|
||||
{
|
||||
try
|
||||
{
|
||||
using var key = Registry.LocalMachine.OpenSubKey(PlatformKey);
|
||||
var pfeConfig = key?.GetValue("PfeConfigOptions") as string;
|
||||
if (string.IsNullOrWhiteSpace(pfeConfig))
|
||||
{
|
||||
return new PrerequisiteCheck("registry:ArchestrA.Platform.Deployed", PrerequisiteCategory.AvevaInstall,
|
||||
PrerequisiteStatus.Warn,
|
||||
$"No Platform object deployed locally (Platform\\PfeConfigOptions empty). MXAccess will connect but subscriptions will fail. Deploy a Platform from the IDE.");
|
||||
}
|
||||
|
||||
// PfeConfigOptions format: "PlatformId=N,EngineId=N,EngineName=...,..."
|
||||
// A non-deployed state leaves PlatformId=0 or the key empty.
|
||||
if (pfeConfig.Contains("PlatformId=0,", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
return new PrerequisiteCheck("registry:ArchestrA.Platform.Deployed", PrerequisiteCategory.AvevaInstall,
|
||||
PrerequisiteStatus.Warn,
|
||||
$"Platform never deployed (PfeConfigOptions has PlatformId=0). Deploy a Platform from the IDE before running live tests.");
|
||||
}
|
||||
|
||||
return new PrerequisiteCheck("registry:ArchestrA.Platform.Deployed", PrerequisiteCategory.AvevaInstall,
|
||||
PrerequisiteStatus.Pass,
|
||||
$"Platform deployed ({pfeConfig}).");
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return new PrerequisiteCheck("registry:ArchestrA.Platform.Deployed", PrerequisiteCategory.AvevaInstall,
|
||||
PrerequisiteStatus.Warn,
|
||||
$"Probe failed: {ex.GetType().Name}: {ex.Message}");
|
||||
}
|
||||
}
|
||||
|
||||
[SupportedOSPlatform("windows")]
|
||||
private static PrerequisiteCheck RebootPendingWindows()
|
||||
{
|
||||
try
|
||||
{
|
||||
using var key = Registry.LocalMachine.OpenSubKey(MsiInstallKey);
|
||||
var rebootRequired = key?.GetValue("RebootRequired") as string;
|
||||
if (string.Equals(rebootRequired, "True", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
return new PrerequisiteCheck("registry:ArchestrA.RebootPending", PrerequisiteCategory.AvevaInstall,
|
||||
PrerequisiteStatus.Warn,
|
||||
"An ArchestrA patch has been installed but the machine hasn't rebooted. Post-patch behavior is undefined until a reboot.");
|
||||
}
|
||||
return new PrerequisiteCheck("registry:ArchestrA.RebootPending", PrerequisiteCategory.AvevaInstall,
|
||||
PrerequisiteStatus.Pass,
|
||||
"No pending reboot flagged.");
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return new PrerequisiteCheck("registry:ArchestrA.RebootPending", PrerequisiteCategory.AvevaInstall,
|
||||
PrerequisiteStatus.Warn,
|
||||
$"Probe failed: {ex.GetType().Name}: {ex.Message}");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Read the registered <see cref="ComProgIdCheck"/> CLSID for the given ProgID and
|
||||
/// resolve the 32-bit <c>InprocServer32</c> file path. Returns null when either is missing.
|
||||
/// </summary>
|
||||
[SupportedOSPlatform("windows")]
|
||||
internal static (string? Clsid, string? InprocDllPath) ResolveProgIdToInproc(string progId)
|
||||
{
|
||||
using var progIdKey = Registry.ClassesRoot.OpenSubKey($@"{progId}\CLSID");
|
||||
var clsid = progIdKey?.GetValue(null) as string;
|
||||
if (string.IsNullOrWhiteSpace(clsid)) return (null, null);
|
||||
|
||||
// 32-bit COM server under Wow6432Node\CLSID\{guid}\InprocServer32 default value.
|
||||
using var inproc = Registry.LocalMachine.OpenSubKey(
|
||||
$@"SOFTWARE\Classes\WOW6432Node\CLSID\{clsid}\InprocServer32");
|
||||
var dll = inproc?.GetValue(null) as string;
|
||||
return (clsid, dll);
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user