17 Commits

Author SHA1 Message Date
Joseph Doherty 1c579410cd fix(runtime): flag cross-cluster orphan-equipment bindings on rebuild
v2-ci / build (push) Failing after 42s
v2-ci / unit-tests (tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Security.Tests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.IntegrationTests) (push) Has been skipped
ParseComposition(blob, nodeId, onInconsistency?) detects a kept equipment whose
UNS line belongs to another cluster (a same-cluster-invariant violation that
would orphan the equipment folder) and reports it via an optional callback,
wired to OpcUaPublishActor's logger. Detection-only; the upstream draft
validator remains the authority. Adds two unit tests.
2026-06-07 08:24:11 -04:00
Joseph Doherty b0a62a9f3b fix(docker-dev): self-bootstrap schema via one-shot migrator (fixes fresh-volume quirks)
Adds a 'migrator' Dockerfile stage + Compose service that runs 'dotnet ef
database update' once on bring-up, so a fresh SQL volume gets the schema with no
operator step (quirk 1). cluster-seed + every host node depend on it via
service_completed_successfully, so the seed never races an in-progress migration
(quirk 2). Host build pinned to target: runtime (the migrator is now the last
stage). entrypoint + README updated; the manual 'dotnet ef' first-time step is
gone. Verified: down -v + up --build self-bootstraps (migrator+seed exit 0,
6 nodes up), deploy Sealed 6/6.
2026-06-07 08:20:56 -04:00
Joseph Doherty 1f76eac97a fix(controlplane): case-insensitive NodeId equality for deploy ack-set
Aligns ConfigPublishCoordinator's _acks/_expectedAcks with the case-insensitive
ClusterId/NodeId scoping in DeploymentArtifact.ResolveClusterScope, so an ack
from a node whose host:port differs only in case still matches its expected-ack
entry (SQL collation + DNS are case-insensitive).
2026-06-07 08:20:56 -04:00
Joseph Doherty b45e0be427 docs(docker-dev): document first-time DB migrate + reseed (fresh-volume bootstrap) 2026-06-07 03:56:33 -04:00
Joseph Doherty e4a3f07c99 test(integration): poll per-node diagnostics to remove timing fragility (review) 2026-06-07 03:37:04 -04:00
Joseph Doherty b88ae5db10 docs(docker-dev): document single-mesh hub-and-spoke topology
Rewrite docker-dev/README.md and update docker-dev/Dockerfile comment
to reflect the new topology: one Akka mesh seeded by central-1/central-2
(fused admin+driver, MAIN cluster, single UI at http://localhost:9200),
with site-a-*/site-b-* as driver-only members scoped by ClusterId.
Removes all references to the old three-mesh layout (admin-a, admin-b,
driver-a, driver-b, site-a.localhost, site-b.localhost).
2026-06-07 03:34:49 -04:00
Joseph Doherty ec9599e234 test(integration): multi-cluster deploy scopes drivers per node 2026-06-07 03:31:08 -04:00
Joseph Doherty 8ce57e47a3 feat(runtime): OPC UA rebuild materialises only the node's ClusterId slice 2026-06-07 03:23:02 -04:00
Joseph Doherty 1b7f995aea feat(runtime): DriverHost spawns + subscribes only its own ClusterId's drivers 2026-06-07 03:19:22 -04:00
Joseph Doherty 4fca4e1aca feat(runtime): node-scoped ParseComposition filters address space by ClusterId 2026-06-07 03:15:46 -04:00
Joseph Doherty 7b2f64fdb8 refactor(runtime): case-insensitive ClusterId/NodeId match + suppress short-circuit + edge tests (review) 2026-06-07 03:12:09 -04:00
Joseph Doherty 05471dc36c feat(docker-dev): seed MAIN ClusterNodes as central-1/central-2 2026-06-07 03:09:16 -04:00
Joseph Doherty 7bba86b2af feat(docker-dev): Traefik routes only the central cluster UI 2026-06-07 03:08:26 -04:00
Joseph Doherty 5f48f81d5a feat(docker-dev): single-mesh hub-and-spoke (central-1/2 + driver-only sites) 2026-06-07 03:08:17 -04:00
Joseph Doherty 24796f2c12 feat(runtime): ClusterId scope resolution + node-scoped driver-spec parse 2026-06-07 03:05:02 -04:00
Joseph Doherty 7bec2fd4db docs(plan): per-ClusterId scoping implementation plan + task graph
10 tasks: runtime scoping (ResolveClusterScope + scoped ParseDriverInstances/
ParseComposition, DriverHostActor + OpcUaPublishActor wiring, multi-cluster
E2E) then docker-dev compose/traefik/seed rewrite, live verification, docs.
2026-06-07 02:59:46 -04:00
Joseph Doherty ab8900eee5 docs(design): per-ClusterId scoping for hub-and-spoke single mesh
Central cluster (2 fused admin+driver nodes) hosts the only UI + deploy
singleton; site clusters (2 driver-only nodes each) join the central mesh
and are logically separated by ClusterId. Each node applies only its own
cluster's drivers + address space on a global deploy. Approved design;
next step is the implementation plan.
2026-06-07 02:50:49 -04:00
18 changed files with 2090 additions and 320 deletions
+17 -1
View File
@@ -1,6 +1,7 @@
# Multi-stage build of OtOpcUa.Host targeting linux-x64. Used by docker-dev/docker-compose.yml
# to spin four host containers (admin-a, admin-b, driver-a, driver-b) from a single image —
# to spin six host containers (central-1, central-2, site-a-1, site-a-2, site-b-1, site-b-2) from a single image —
# Compose drives OTOPCUA_ROLES + Cluster:* env per container to differentiate them.
# A separate `migrator` stage (below) applies EF migrations once on bring-up.
FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build
WORKDIR /src
@@ -18,3 +19,18 @@ EXPOSE 4053
EXPOSE 4840
ENTRYPOINT ["dotnet", "OtOpcUa.Host.dll"]
# ── Migrator (one-shot) ──────────────────────────────────────────────────────
# Applies EF Core migrations to the ConfigDb so a fresh SQL volume gets the schema
# with no operator step. docker-dev compose runs this once, before cluster-seed +
# the host nodes (they depend on it via service_completed_successfully). The host
# nodes deliberately do NOT auto-migrate (production owns schema changes), so this
# rig-only stage carries that responsibility. The connection comes from the
# OTOPCUA_CONFIG_CONNECTION env var read by DesignTimeDbContextFactory.
FROM build AS migrator
RUN dotnet tool install --global dotnet-ef --version 10.0.7
ENV PATH="${PATH}:/root/.dotnet/tools"
WORKDIR /src
ENTRYPOINT ["dotnet", "ef", "database", "update", \
"--project", "src/Core/ZB.MOM.WW.OtOpcUa.Configuration", \
"--startup-project", "src/Core/ZB.MOM.WW.OtOpcUa.Configuration"]
+41 -38
View File
@@ -1,6 +1,6 @@
# docker-dev
Mac-friendly multi-cluster OtOpcUa fleet for manual UI exercise + integration smoke tests. Spins up **three isolated Akka clusters** + SQL Server + Traefik on the same Compose network. All three clusters share the single `OtOpcUa` ConfigDb — multi-tenancy is enforced by per-row `ServerCluster.ClusterId` scoping. Akka.Cluster gossip stays isolated between meshes because their seed-node lists are disjoint, even though they share the same system name `otopcua`.
Mac-friendly OtOpcUa fleet for manual UI exercise + integration smoke tests. Spins up **one single Akka mesh** (hub-and-spoke topology) + SQL Server + Traefik on the same Compose network. All six host nodes share the single `OtOpcUa` ConfigDb — logical separation between MAIN, SITE-A, and SITE-B is enforced by per-row `ServerCluster.ClusterId` scoping, not by mesh isolation.
## Stack
@@ -8,50 +8,48 @@ Mac-friendly multi-cluster OtOpcUa fleet for manual UI exercise + integration sm
| Service | Role | Ports |
|---|---|---|
| `sql` | SQL Server 2022 — single `OtOpcUa` ConfigDb shared by all three clusters | host `14330` → container `1433` |
| `traefik` | Routes :80 by Host header / PathPrefix | host `80`, dashboard `8089` |
| `sql` | SQL Server 2022 — single `OtOpcUa` ConfigDb shared by all nodes | host `14330` → container `1433` |
| `traefik` | Routes `:80` by PathPrefix to central admin nodes | host `80`, dashboard `8089` |
Authentication uses the **shared GLAuth** on the Linux Docker host at `10.100.0.35:3893` (baseDN `dc=zb,dc=local`). Every host container binds that instance via `cn=serviceaccount,dc=zb,dc=local`. `DevStubMode` is **not** active. Sign in as `multi-role` / `password` to get all three OtOpcUa roles (Administrator, Designer, Viewer), or use any other shared test user with password `password`. Group→role mappings are seeded by `seed/seed-clusters.sql` (`OtOpcUa-Admins`→Administrator, `OtOpcUa-Designers`→Designer, `OtOpcUa-Viewers`→Viewer). The shared GLAuth source of truth and deploy runbook live in `scadaproj/infra/glauth/`.
Authentication uses the **shared GLAuth** on the Linux Docker host at `10.100.0.35:3893` (baseDN `dc=zb,dc=local`). Only the central admin nodes authenticate users. Sign in as `multi-role` / `password` to get all three OtOpcUa roles (Administrator, Designer, Viewer), or use any other shared test user with password `password`. Group→role mappings are seeded by `seed/seed-clusters.sql` (`OtOpcUa-Admins`→Administrator, `OtOpcUa-Designers`→Designer, `OtOpcUa-Viewers`→Viewer). The shared GLAuth source of truth and deploy runbook live in `scadaproj/infra/glauth/`.
### Main cluster — split admin/driver roles
### Central nodes — fused admin+driver (MAIN cluster, UI + deploy singleton)
| Service | Role | Ports |
| Service | Roles | Ports |
|---|---|---|
| `admin-a` | `OTOPCUA_ROLES=admin`, cluster seed | internal `9000` |
| `admin-b` | `OTOPCUA_ROLES=admin`, joins admin-a | internal `9000` |
| `driver-a` | `OTOPCUA_ROLES=driver` | host `4840` → container `4840` |
| `driver-b` | `OTOPCUA_ROLES=driver` | host `4841` → container `4840` |
| `central-1` | `OTOPCUA_ROLES=admin,driver`, Akka mesh seed | host `4840` → container `4840`; internal `9000` |
| `central-2` | `OTOPCUA_ROLES=admin,driver`, joins central-1 | host `4841` → container `4840`; internal `9000` |
### Site A cluster — 2-node fused admin+driver
`central-1` and `central-2` are the **only** nodes that host the Admin UI and the deploy singleton. They are also the OPC UA publishers for the MAIN cluster. Traefik routes all `PathPrefix(/)` traffic to whichever central node has the leader role.
| Service | Role | Ports |
### Site A nodes — driver-only (SITE-A cluster)
| Service | Roles | Ports |
|---|---|---|
| `site-a-1` | `OTOPCUA_ROLES=admin,driver`, cluster seed | host `4842` → container `4840` |
| `site-a-2` | `OTOPCUA_ROLES=admin,driver`, joins site-a-1 | host `4843` → container `4840` |
| `site-a-1` | `OTOPCUA_ROLES=driver`, joins the single mesh | host `4842` → container `4840` |
| `site-a-2` | `OTOPCUA_ROLES=driver`, joins the single mesh | host `4843` → container `4840` |
### Site B cluster — 2-node fused admin+driver
### Site B nodes — driver-only (SITE-B cluster)
| Service | Role | Ports |
| Service | Roles | Ports |
|---|---|---|
| `site-b-1` | `OTOPCUA_ROLES=admin,driver`, cluster seed | host `4844` → container `4840` |
| `site-b-2` | `OTOPCUA_ROLES=admin,driver`, joins site-b-1 | host `4845` → container `4840` |
| `site-b-1` | `OTOPCUA_ROLES=driver`, joins the single mesh | host `4844` → container `4840` |
| `site-b-2` | `OTOPCUA_ROLES=driver`, joins the single mesh | host `4845` → container `4840` |
All containers bind Akka remoting to port `4053` inside their own network namespace; the `PublicHostname` of each matches its Compose service name. Akka mesh isolation is enforced purely by disjoint seed lists. Configuration-side isolation is enforced by `ServerCluster.ClusterId` — see "Multi-tenancy" below.
Site nodes serve no UI and authenticate no users. The central cluster manages and deploys to them over the shared Akka mesh. All six nodes bind Akka remoting to port `4053` inside their own network namespace; `PublicHostname` for each matches its Compose service name.
## Multi-tenancy
All eight host nodes write to the same `OtOpcUa` ConfigDb. The `ServerCluster` table differentiates the three Akka meshes: each Akka cluster maps to one row, and each `ClusterNode` row's `ClusterId` ties the runtime node back to its owning cluster scope.
All six host nodes write to the same `OtOpcUa` ConfigDb. The `ServerCluster` table differentiates the three logical clusters: each maps to one row, and each `ClusterNode` row's `ClusterId` ties the runtime node back to its owning cluster scope.
A one-shot `cluster-seed` Compose service (image `mcr.microsoft.com/mssql-tools`) waits for SQL + the EF auto-migration to complete and then INSERTs the rows below. The seed is **idempotent**`IF NOT EXISTS` guards every insert — so re-runs on `docker compose up` are no-ops:
Two one-shot Compose services bootstrap the DB on bring-up: `migrator` applies the EF Core migrations (so a fresh SQL volume gets the schema with no operator step — the host nodes deliberately do **not** auto-migrate, since production owns schema changes), then `cluster-seed` (image `mcr.microsoft.com/mssql-tools`) INSERTs the rows below. `cluster-seed` and every host node `depend_on` the `migrator` completing (`service_completed_successfully`), so the seed never races an in-progress migration. The seed is **idempotent**`IF NOT EXISTS` guards every insert — so re-runs on `docker compose up` are no-ops:
| Akka mesh | `ServerCluster.ClusterId` | `ClusterNode.NodeId` rows |
| Logical cluster | `ServerCluster.ClusterId` | `ClusterNode.NodeId` rows |
|---|---|---|
| Main | `MAIN` | `driver-a`, `driver-b` (OPC UA publishers) |
| Main | `MAIN` | `central-1`, `central-2` (OPC UA publishers + admin UI) |
| Site A | `SITE-A` | `site-a-1`, `site-a-2` |
| Site B | `SITE-B` | `site-b-1`, `site-b-2` |
`ClusterNode` is the table for **OPC UA-publishing nodes** (not every Akka cluster member), which is why the main cluster's `admin-a` / `admin-b` don't get rows — they're control-plane-only.
Each `ClusterNode.NodeId` matches the node's `Cluster__PublicHostname` env value (Compose service name) — that's the lookup the runtime uses to resolve its own membership. `ApplicationUri` follows the `urn:OtOpcUa:<NodeId>` convention.
The SQL lives at `seed/seed-clusters.sql`; the wait-and-apply wrapper lives at `seed/entrypoint.sh`. To re-seed manually:
@@ -72,21 +70,25 @@ The DriverHost actor doesn't spawn drivers from raw DriverInstance rows on its o
# from the repo root
docker compose -f docker-dev/docker-compose.yml up -d --build
# wait ~20 seconds for SQL to come up + all three clusters to form
# the one-shot migrator + cluster-seed bootstrap the DB; watch the seed finish:
docker compose -f docker-dev/docker-compose.yml logs -f cluster-seed # ^C once it prints "[cluster-seed] done."
open http://localhost # main cluster admin UI
open http://site-a.localhost # site A admin UI
open http://site-b.localhost # site B admin UI
open http://localhost:9200 # Admin UI (Traefik → central-1 or central-2)
open http://localhost:8089 # Traefik dashboard
```
On macOS, `*.localhost` resolves to `127.0.0.1` automatically. On Linux add `127.0.0.1 site-a.localhost site-b.localhost` to `/etc/hosts` if your resolver doesn't.
The first build takes a few minutes (.NET SDK image + restore + publish). Subsequent rebuilds are faster with Docker's layer cache.
The first build takes a few minutes (.NET SDK image + restore + publish). **No manual schema step is needed** — on a fresh SQL volume the one-shot `migrator` service applies the EF migrations (the host nodes deliberately don't auto-migrate, since production owns schema changes), then `cluster-seed` populates the cluster/namespace/driver rows. `cluster-seed` and the host nodes wait for the migrator via `service_completed_successfully`, so nothing races an in-progress migration. A plain `docker compose ... up -d` on an existing volume is a fast no-op for both — the named SQL volume keeps the schema + rows across restarts; only `down -v` wipes them, after which the next `up` re-migrates + re-seeds automatically.
## Auth (dev only)
All host containers authenticate against the shared GLAuth at `10.100.0.35:3893` (baseDN `dc=zb,dc=local`). `DevStubMode` is **not** active. Sign in with any test user (password `password`); `multi-role` / `password` returns all three roles (Administrator, Designer, Viewer). Group→role mappings are seeded by `seed/seed-clusters.sql`. The GLAuth source of truth + deploy runbook is in `scadaproj/infra/glauth/`. **Do not** enable `DevStubMode` outside local debugging — production must always bind a real LDAP backend.
Central nodes authenticate against the shared GLAuth at `10.100.0.35:3893` (baseDN `dc=zb,dc=local`). `DevStubMode` is **not** active. Sign in with any test user (password `password`); `multi-role` / `password` returns all three roles (Administrator, Designer, Viewer). Group→role mappings are seeded by `seed/seed-clusters.sql`. The GLAuth source of truth + deploy runbook is in `scadaproj/infra/glauth/`. **Do not** enable `DevStubMode` outside local debugging — production must always bind a real LDAP backend.
## Headless deploy
```bash
POST http://localhost:9200/api/deployments
X-Api-Key: docker-dev-deploy-key
```
## Tear down
@@ -98,15 +100,16 @@ The `-v` drops the SQL volume; remove it to keep ConfigDb state across restarts.
## Failover smoke
1. Watch the Traefik dashboard at `http://localhost:8089`. Both `admin-a` and `admin-b` should be listed as healthy in the `otopcua-admin` service.
2. `docker compose -f docker-dev/docker-compose.yml stop admin-a``admin-b` should pick up the admin role-leader within ~15 s (Akka split-brain stable-after). Traefik will route traffic to `admin-b` once its `/health/active` returns 200.
3. `docker compose -f docker-dev/docker-compose.yml start admin-a``admin-a` rejoins as a follower; `admin-b` keeps the leader role until something disturbs it.
1. Watch the Traefik dashboard at `http://localhost:8089`. Both `central-1` and `central-2` should be listed as healthy in the `otopcua-admin` service.
2. `docker compose -f docker-dev/docker-compose.yml stop central-1``central-2` should pick up the admin role-leader within ~15 s (Akka split-brain stable-after). Traefik will route traffic to `central-2` once its `/health/active` returns 200.
3. `docker compose -f docker-dev/docker-compose.yml start central-1``central-1` rejoins as a follower; `central-2` keeps the leader role until something disturbs it.
## Notes
- This compose is for the **local Mac/Linux developer rig**. The team's CI + soak runs go to the remote docker host at `10.100.0.35` (see `docs/v2/dev-environment.md`); the file there mirrors this one with adjusted port bindings.
- The OPC UA driver endpoints are reachable directly from the host (Traefik is only in front of the admin HTTP surface):
- Main: `opc.tcp://localhost:4840` (driver-a), `opc.tcp://localhost:4841` (driver-b)
- The OPC UA endpoints are reachable directly from the host (Traefik is only in front of the admin HTTP surface):
- Main: `opc.tcp://localhost:4840` (central-1), `opc.tcp://localhost:4841` (central-2)
- Site A: `opc.tcp://localhost:4842` (site-a-1), `opc.tcp://localhost:4843` (site-a-2)
- Site B: `opc.tcp://localhost:4844` (site-b-1), `opc.tcp://localhost:4845` (site-b-2)
- Galaxy + Wonderware drivers can't run in Linux containers (they need the Windows-only mxaccessgw + Historian SDK). On non-Windows, `DriverInstanceActor.ShouldStub(driverType, roles)` returns `true` for those types and the actor goes straight to a `Stubbed` state that returns deterministic success.
- SQL persistence: ConfigDb state survives container restarts (named Docker volume). Drop the volume with `down -v` for a clean slate.
+152 -192
View File
@@ -1,40 +1,46 @@
# docker-dev/ — Mac-friendly multi-cluster fleet for v2 development + manual UI exercise.
# docker-dev/ — Mac-friendly single-mesh hub-and-spoke fleet for v2 development + manual UI exercise.
#
# Stack (3 separate Akka clusters — all share the single `OtOpcUa` ConfigDb):
# sql SQL Server 2022 — hosts the one ConfigDb that all three clusters use
# ldap OpenLDAP with the dev users from C:\publish\glauth\auth.md mirrored in
# Topology: ONE Akka mesh seeded by `central-1`. Logical separation between
# tenants is by ServerCluster.ClusterId rows (MAIN / SITE-A / SITE-B) in the one
# shared `OtOpcUa` ConfigDb — NOT by separate meshes. All six host nodes join the
# same gossip ring and the central UI deploys to every cluster over it.
#
# Main cluster (existing — split-role admin / driver pair on a single Akka mesh):
# admin-a OtOpcUa.Host with OTOPCUA_ROLES=admin (seed)
# admin-b OtOpcUa.Host with OTOPCUA_ROLES=admin (joins admin-a)
# driver-a OtOpcUa.Host with OTOPCUA_ROLES=driver (joins via admin-a)
# driver-b OtOpcUa.Host with OTOPCUA_ROLES=driver (joins via admin-a)
# Stack:
# sql SQL Server 2022 — hosts the one ConfigDb every node uses
# cluster-seed one-shot mssql-tools job that INSERTs the ServerCluster +
# ClusterNode rows scoping each tenant, then exits (idempotent)
#
# Site A cluster (2-node fused admin+driver):
# site-a-1, site-a-2 OTOPCUA_ROLES=admin,driver, seed = site-a-1
# central-1, central-2 OTOPCUA_ROLES=admin,driver — the ONLY UI + deploy
# singleton, plus the MAIN cluster's OPC UA publishers.
# Reachable at http://localhost:9200 (via Traefik).
# central-1 is the Akka seed node; central-2 joins it.
# site-a-1, site-a-2 OTOPCUA_ROLES=driver — driver-only members of the same
# site-b-1, site-b-2 mesh, scoped to SITE-A / SITE-B by ClusterId. They
# serve no UI and authenticate no users; the central
# cluster manages and deploys to them over the mesh.
#
# Site B cluster (2-node fused admin+driver):
# site-b-1, site-b-2 OTOPCUA_ROLES=admin,driver, seed = site-b-1
# Auth is real LDAP against the shared GLAuth on the Linux Docker host
# (10.100.0.35:3893, dc=zb,dc=local) — there is no LDAP container here.
# Only the admin-role central nodes carry the Security__Ldap__* block.
# Sign in `multi-role` / `password`.
#
# traefik PathPrefix → main cluster admin-a/admin-b; Host(`site-a.localhost`) →
# site-a-*; Host(`site-b.localhost`) → site-b-*. Add the two site hosts to
# your /etc/hosts (or rely on macOS `.localhost` auto-resolution).
# traefik PathPrefix(`/`) → central-1 / central-2 (the single UI route).
#
# Multi-tenancy: ConfigDb is one schema with a `ServerCluster` table; each Akka cluster
# corresponds to a row in it (ClusterId = "MAIN" / "SITE-A" / "SITE-B"), and each node's
# `ClusterNode.NodeId` points back at the row that owns it. After first boot, sign in to
# any cluster's Admin UI and create the matching ServerCluster + ClusterNode rows via
# /clusters and /hosts so the runtime knows what configuration scope applies.
# OPC UA endpoints (host-side port → container 4840):
# central-1 :4840 central-2 :4841
# site-a-1 :4842 site-a-2 :4843
# site-b-1 :4844 site-b-2 :4845
#
# Akka mesh isolation: same system name "otopcua" + same remoting port 4053 inside each
# container's own network namespace, but with disjoint seed-node lists — gossip never
# crosses between the three meshes.
# Headless deploy: POST http://localhost:9200/api/deployments with the
# X-Api-Key header (Security__DeployApiKey = "docker-dev-deploy-key").
#
# SQL persistence: the otopcua-mssql-data named volume keeps the ConfigDb schema
# + seeded clusters across `docker compose up` cycles; without it a recreate
# silently drops the OtOpcUa database.
#
# Usage:
# docker compose -f docker-dev/docker-compose.yml up -d --build
# open http://localhost # main cluster Blazor admin UI
# open http://site-a.localhost # site A admin UI
# open http://site-b.localhost # site B admin UI
# open http://localhost:9200 # central Blazor admin UI
# open http://localhost:8089 # Traefik dashboard (8080 is the sister scadalink stack)
#
# Tear-down: docker compose -f docker-dev/docker-compose.yml down -v
@@ -64,129 +70,68 @@ services:
timeout: 5s
retries: 20
# ── Cluster seed (one-shot) ────────────────────────────────────────────────
# Waits for SQL + the host containers' EF auto-migration, then INSERTs the
# three ServerCluster rows and the six ClusterNode rows that scope each Akka
# mesh inside the shared OtOpcUa ConfigDb. Idempotent — re-runs are no-ops.
cluster-seed:
image: mcr.microsoft.com/mssql-tools:latest
# ── Migrator (one-shot) ────────────────────────────────────────────────────
# Applies EF Core migrations to the OtOpcUa ConfigDb so a fresh SQL volume gets
# the schema with no operator step (the host nodes deliberately don't auto-
# migrate — production owns schema changes). cluster-seed + every host node
# depend on this completing, so nothing races an in-progress migration.
# Idempotent: a no-op once the schema is current.
migrator:
build:
context: ..
dockerfile: docker-dev/Dockerfile
target: migrator
image: otopcua-migrator:dev
depends_on:
sql:
condition: service_healthy
environment:
OTOPCUA_CONFIG_CONNECTION: "Server=sql,1433;Database=OtOpcUa;User Id=sa;Password=OtOpcUa!Dev123;TrustServerCertificate=True;"
restart: "no"
# ── Cluster seed (one-shot) ────────────────────────────────────────────────
# Runs only after `migrator` completes (so the schema is final — no race), then
# INSERTs the three ServerCluster rows and the six ClusterNode rows that scope
# each tenant inside the shared OtOpcUa ConfigDb. Idempotent — re-runs are no-ops.
cluster-seed:
image: mcr.microsoft.com/mssql-tools:latest
depends_on:
migrator:
condition: service_completed_successfully
volumes:
- ./seed:/seed:ro
entrypoint: ["/bin/bash", "/seed/entrypoint.sh"]
restart: "no"
# OpenLDAP was previously here but the bitnami/openldap:2.6 image was retired
# (manifest gone) and bitnamilegacy/openldap:2.6 crashes during LDIF setup with
# exit 68. For the dev compose every host container now runs with
# Security__Ldap__DevStubMode=true, so any non-empty username/password
# signs in as `Administrator`. Restore a real LDAP service when there's a need
# for end-to-end LDAP coverage (the host code path is unchanged).
# A local OpenLDAP container used to live here, but the bitnami/openldap:2.6
# image was retired (manifest gone) and bitnamilegacy/openldap:2.6 crashes
# during LDIF setup (exit 68). Rather than stub auth, the central (admin-role)
# containers bind the shared GLAuth on the Linux Docker host (Security__Ldap__*
# below: 10.100.0.35:3893, dc=zb,dc=local, DevStubMode=false) — so dev auth
# exercises the real LDAP bind + group→role path. Sign in `multi-role` /
# `password` (all roles) or any shared test user / `password`.
admin-a: &otopcua-host
# ── Central cluster (2-node fused admin+driver) ─────────────────────────────
# The only UI + deploy singleton; also the MAIN cluster's OPC UA publishers.
# central-1 seeds the single Akka mesh that every other node joins.
central-1: &otopcua-host
build:
context: ..
dockerfile: docker-dev/Dockerfile
target: runtime
image: otopcua-host:dev
depends_on:
sql: { condition: service_healthy }
environment:
OTOPCUA_ROLES: "admin"
ASPNETCORE_URLS: "http://+:9000"
ConnectionStrings__ConfigDb: "Server=sql,1433;Database=OtOpcUa;User Id=sa;Password=OtOpcUa!Dev123;TrustServerCertificate=True;"
Cluster__Hostname: "0.0.0.0"
Cluster__Port: "4053"
Cluster__PublicHostname: "admin-a"
Cluster__SeedNodes__0: "akka.tcp://otopcua@admin-a:4053"
Cluster__Roles__0: "admin"
Security__Jwt__SigningKey: "docker-dev-signing-key-with-at-least-32-bytes-of-utf8-content-12345"
Security__Jwt__Issuer: "otopcua-dev"
Security__Jwt__Audience: "otopcua-dev"
Security__Ldap__Enabled: "true"
Security__Ldap__DevStubMode: "false"
Security__Ldap__Server: "10.100.0.35"
Security__Ldap__Port: "3893"
Security__Ldap__Transport: "None"
Security__Ldap__AllowInsecure: "true"
Security__Ldap__SearchBase: "dc=zb,dc=local"
Security__Ldap__ServiceAccountDn: "cn=serviceaccount,dc=zb,dc=local"
Security__Ldap__ServiceAccountPassword: "serviceaccount123"
Security__DeployApiKey: "docker-dev-deploy-key"
GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}"
admin-b:
<<: *otopcua-host
environment:
OTOPCUA_ROLES: "admin"
ASPNETCORE_URLS: "http://+:9000"
ConnectionStrings__ConfigDb: "Server=sql,1433;Database=OtOpcUa;User Id=sa;Password=OtOpcUa!Dev123;TrustServerCertificate=True;"
Cluster__Hostname: "0.0.0.0"
Cluster__Port: "4053"
Cluster__PublicHostname: "admin-b"
Cluster__SeedNodes__0: "akka.tcp://otopcua@admin-a:4053"
Cluster__Roles__0: "admin"
Security__Jwt__SigningKey: "docker-dev-signing-key-with-at-least-32-bytes-of-utf8-content-12345"
Security__Jwt__Issuer: "otopcua-dev"
Security__Jwt__Audience: "otopcua-dev"
Security__Ldap__Enabled: "true"
Security__Ldap__DevStubMode: "false"
Security__Ldap__Server: "10.100.0.35"
Security__Ldap__Port: "3893"
Security__Ldap__Transport: "None"
Security__Ldap__AllowInsecure: "true"
Security__Ldap__SearchBase: "dc=zb,dc=local"
Security__Ldap__ServiceAccountDn: "cn=serviceaccount,dc=zb,dc=local"
Security__Ldap__ServiceAccountPassword: "serviceaccount123"
Security__DeployApiKey: "docker-dev-deploy-key"
GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}"
driver-a:
<<: *otopcua-host
environment:
OTOPCUA_ROLES: "driver"
ConnectionStrings__ConfigDb: "Server=sql,1433;Database=OtOpcUa;User Id=sa;Password=OtOpcUa!Dev123;TrustServerCertificate=True;"
Cluster__Hostname: "0.0.0.0"
Cluster__Port: "4053"
Cluster__PublicHostname: "driver-a"
Cluster__SeedNodes__0: "akka.tcp://otopcua@admin-a:4053"
Cluster__Roles__0: "driver"
# Resolved at runtime by GalaxyDriver.ResolveApiKey when a DriverInstance's
# Gateway.ApiKeySecretRef = "env:GALAXY_MXGW_API_KEY".
GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}"
ports:
- "4840:4840"
driver-b:
<<: *otopcua-host
environment:
OTOPCUA_ROLES: "driver"
ConnectionStrings__ConfigDb: "Server=sql,1433;Database=OtOpcUa;User Id=sa;Password=OtOpcUa!Dev123;TrustServerCertificate=True;"
Cluster__Hostname: "0.0.0.0"
Cluster__Port: "4053"
Cluster__PublicHostname: "driver-b"
Cluster__SeedNodes__0: "akka.tcp://otopcua@admin-a:4053"
Cluster__Roles__0: "driver"
GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}"
ports:
- "4841:4840"
# ── Site A cluster (2-node fused admin+driver) ──────────────────────────────
# Shares the OtOpcUa ConfigDb with the main + site-b clusters; multi-tenancy is
# enforced by ServerCluster.ClusterId rows (configure via /clusters after boot).
# Akka isolation comes from the disjoint seed list (seed = site-a-1).
site-a-1:
<<: *otopcua-host
migrator: { condition: service_completed_successfully }
environment:
OTOPCUA_ROLES: "admin,driver"
ASPNETCORE_URLS: "http://+:9000"
ConnectionStrings__ConfigDb: "Server=sql,1433;Database=OtOpcUa;User Id=sa;Password=OtOpcUa!Dev123;TrustServerCertificate=True;"
Cluster__Hostname: "0.0.0.0"
Cluster__Port: "4053"
Cluster__PublicHostname: "site-a-1"
Cluster__SeedNodes__0: "akka.tcp://otopcua@site-a-1:4053"
Cluster__PublicHostname: "central-1"
Cluster__SeedNodes__0: "akka.tcp://otopcua@central-1:4053"
Cluster__Roles__0: "admin"
Cluster__Roles__1: "driver"
Security__Jwt__SigningKey: "docker-dev-signing-key-with-at-least-32-bytes-of-utf8-content-12345"
@@ -203,6 +148,64 @@ services:
Security__Ldap__ServiceAccountPassword: "serviceaccount123"
Security__DeployApiKey: "docker-dev-deploy-key"
GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}"
ports:
- "4840:4840"
central-2:
<<: *otopcua-host
depends_on:
sql: { condition: service_healthy }
central-1: { condition: service_started }
migrator: { condition: service_completed_successfully }
environment:
OTOPCUA_ROLES: "admin,driver"
ASPNETCORE_URLS: "http://+:9000"
ConnectionStrings__ConfigDb: "Server=sql,1433;Database=OtOpcUa;User Id=sa;Password=OtOpcUa!Dev123;TrustServerCertificate=True;"
Cluster__Hostname: "0.0.0.0"
Cluster__Port: "4053"
Cluster__PublicHostname: "central-2"
Cluster__SeedNodes__0: "akka.tcp://otopcua@central-1:4053"
Cluster__Roles__0: "admin"
Cluster__Roles__1: "driver"
Security__Jwt__SigningKey: "docker-dev-signing-key-with-at-least-32-bytes-of-utf8-content-12345"
Security__Jwt__Issuer: "otopcua-dev"
Security__Jwt__Audience: "otopcua-dev"
Security__Ldap__Enabled: "true"
Security__Ldap__DevStubMode: "false"
Security__Ldap__Server: "10.100.0.35"
Security__Ldap__Port: "3893"
Security__Ldap__Transport: "None"
Security__Ldap__AllowInsecure: "true"
Security__Ldap__SearchBase: "dc=zb,dc=local"
Security__Ldap__ServiceAccountDn: "cn=serviceaccount,dc=zb,dc=local"
Security__Ldap__ServiceAccountPassword: "serviceaccount123"
Security__DeployApiKey: "docker-dev-deploy-key"
GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}"
ports:
- "4841:4840"
# ── Site A cluster (2-node driver-only) ─────────────────────────────────────
# Driver-only members of the single mesh, scoped to SITE-A by ClusterId. No UI,
# no user auth — managed + deployed to by the central cluster over the mesh.
# All site nodes seed central-1.
site-a-1:
<<: *otopcua-host
depends_on:
sql: { condition: service_healthy }
central-1: { condition: service_started }
migrator: { condition: service_completed_successfully }
environment:
OTOPCUA_ROLES: "driver"
ConnectionStrings__ConfigDb: "Server=sql,1433;Database=OtOpcUa;User Id=sa;Password=OtOpcUa!Dev123;TrustServerCertificate=True;"
Cluster__Hostname: "0.0.0.0"
Cluster__Port: "4053"
Cluster__PublicHostname: "site-a-1"
Cluster__SeedNodes__0: "akka.tcp://otopcua@central-1:4053"
Cluster__Roles__0: "driver"
# Resolved at runtime by GalaxyDriver.ResolveApiKey when a DriverInstance's
# Gateway.ApiKeySecretRef = "env:GALAXY_MXGW_API_KEY".
GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}"
ports:
- "4842:4840"
@@ -210,61 +213,36 @@ services:
<<: *otopcua-host
depends_on:
sql: { condition: service_healthy }
site-a-1: { condition: service_started }
central-1: { condition: service_started }
migrator: { condition: service_completed_successfully }
environment:
OTOPCUA_ROLES: "admin,driver"
ASPNETCORE_URLS: "http://+:9000"
OTOPCUA_ROLES: "driver"
ConnectionStrings__ConfigDb: "Server=sql,1433;Database=OtOpcUa;User Id=sa;Password=OtOpcUa!Dev123;TrustServerCertificate=True;"
Cluster__Hostname: "0.0.0.0"
Cluster__Port: "4053"
Cluster__PublicHostname: "site-a-2"
Cluster__SeedNodes__0: "akka.tcp://otopcua@site-a-1:4053"
Cluster__Roles__0: "admin"
Cluster__Roles__1: "driver"
Security__Jwt__SigningKey: "docker-dev-signing-key-with-at-least-32-bytes-of-utf8-content-12345"
Security__Jwt__Issuer: "otopcua-dev"
Security__Jwt__Audience: "otopcua-dev"
Security__Ldap__Enabled: "true"
Security__Ldap__DevStubMode: "false"
Security__Ldap__Server: "10.100.0.35"
Security__Ldap__Port: "3893"
Security__Ldap__Transport: "None"
Security__Ldap__AllowInsecure: "true"
Security__Ldap__SearchBase: "dc=zb,dc=local"
Security__Ldap__ServiceAccountDn: "cn=serviceaccount,dc=zb,dc=local"
Security__Ldap__ServiceAccountPassword: "serviceaccount123"
Security__DeployApiKey: "docker-dev-deploy-key"
Cluster__SeedNodes__0: "akka.tcp://otopcua@central-1:4053"
Cluster__Roles__0: "driver"
GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}"
ports:
- "4843:4840"
# ── Site B cluster (2-node fused admin+driver) ──────────────────────────────
# ── Site B cluster (2-node driver-only) ─────────────────────────────────────
site-b-1:
<<: *otopcua-host
depends_on:
sql: { condition: service_healthy }
central-1: { condition: service_started }
migrator: { condition: service_completed_successfully }
environment:
OTOPCUA_ROLES: "admin,driver"
ASPNETCORE_URLS: "http://+:9000"
OTOPCUA_ROLES: "driver"
ConnectionStrings__ConfigDb: "Server=sql,1433;Database=OtOpcUa;User Id=sa;Password=OtOpcUa!Dev123;TrustServerCertificate=True;"
Cluster__Hostname: "0.0.0.0"
Cluster__Port: "4053"
Cluster__PublicHostname: "site-b-1"
Cluster__SeedNodes__0: "akka.tcp://otopcua@site-b-1:4053"
Cluster__Roles__0: "admin"
Cluster__Roles__1: "driver"
Security__Jwt__SigningKey: "docker-dev-signing-key-with-at-least-32-bytes-of-utf8-content-12345"
Security__Jwt__Issuer: "otopcua-dev"
Security__Jwt__Audience: "otopcua-dev"
Security__Ldap__Enabled: "true"
Security__Ldap__DevStubMode: "false"
Security__Ldap__Server: "10.100.0.35"
Security__Ldap__Port: "3893"
Security__Ldap__Transport: "None"
Security__Ldap__AllowInsecure: "true"
Security__Ldap__SearchBase: "dc=zb,dc=local"
Security__Ldap__ServiceAccountDn: "cn=serviceaccount,dc=zb,dc=local"
Security__Ldap__ServiceAccountPassword: "serviceaccount123"
Security__DeployApiKey: "docker-dev-deploy-key"
Cluster__SeedNodes__0: "akka.tcp://otopcua@central-1:4053"
Cluster__Roles__0: "driver"
GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}"
ports:
- "4844:4840"
@@ -273,30 +251,16 @@ services:
<<: *otopcua-host
depends_on:
sql: { condition: service_healthy }
site-b-1: { condition: service_started }
central-1: { condition: service_started }
migrator: { condition: service_completed_successfully }
environment:
OTOPCUA_ROLES: "admin,driver"
ASPNETCORE_URLS: "http://+:9000"
OTOPCUA_ROLES: "driver"
ConnectionStrings__ConfigDb: "Server=sql,1433;Database=OtOpcUa;User Id=sa;Password=OtOpcUa!Dev123;TrustServerCertificate=True;"
Cluster__Hostname: "0.0.0.0"
Cluster__Port: "4053"
Cluster__PublicHostname: "site-b-2"
Cluster__SeedNodes__0: "akka.tcp://otopcua@site-b-1:4053"
Cluster__Roles__0: "admin"
Cluster__Roles__1: "driver"
Security__Jwt__SigningKey: "docker-dev-signing-key-with-at-least-32-bytes-of-utf8-content-12345"
Security__Jwt__Issuer: "otopcua-dev"
Security__Jwt__Audience: "otopcua-dev"
Security__Ldap__Enabled: "true"
Security__Ldap__DevStubMode: "false"
Security__Ldap__Server: "10.100.0.35"
Security__Ldap__Port: "3893"
Security__Ldap__Transport: "None"
Security__Ldap__AllowInsecure: "true"
Security__Ldap__SearchBase: "dc=zb,dc=local"
Security__Ldap__ServiceAccountDn: "cn=serviceaccount,dc=zb,dc=local"
Security__Ldap__ServiceAccountPassword: "serviceaccount123"
Security__DeployApiKey: "docker-dev-deploy-key"
Cluster__SeedNodes__0: "akka.tcp://otopcua@central-1:4053"
Cluster__Roles__0: "driver"
GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}"
ports:
- "4845:4840"
@@ -314,12 +278,8 @@ services:
volumes:
- ./traefik-dynamic.yml:/etc/traefik/dynamic.yml:ro
depends_on:
- admin-a
- admin-b
- site-a-1
- site-a-2
- site-b-1
- site-b-2
- central-1
- central-2
volumes:
# SQL Server data dir — persists the OtOpcUa ConfigDb across container recreates.
+9 -16
View File
@@ -1,20 +1,13 @@
#!/usr/bin/env bash
# docker-dev cluster-seed entrypoint. Waits for the OtOpcUa ConfigDb schema to
# be in place, then applies the idempotent row seed.
# docker-dev cluster-seed entrypoint. Applies the idempotent row seed.
#
# IMPORTANT: this container does NOT run EF migrations — sqlcmd can't execute
# the V2 migration script cleanly because it contains CREATE PROCEDURE
# statements inside IF NOT EXISTS BEGIN ... END blocks (procs must be the
# first statement in their batch). Migrations are owned by the operator:
#
# dotnet ef database update \
# --project src/Core/ZB.MOM.WW.OtOpcUa.Configuration \
# --startup-project src/Server/ZB.MOM.WW.OtOpcUa.Host
#
# (with ConnectionStrings__ConfigDb pointing at Server=localhost,14330;...).
# Once the schema is in place, restart the cluster-seed container — or just
# `docker compose up -d` and the seed will pick up where it left off thanks to
# the IF NOT EXISTS guards in seed-clusters.sql.
# This container does NOT run EF migrations — sqlcmd can't execute the migration
# script cleanly (it has CREATE PROCEDURE inside IF NOT EXISTS BEGIN ... END
# blocks; procs must be the first statement in their batch). The schema is owned
# by the `migrator` Compose service (dotnet ef), which this seed depends on via
# `service_completed_successfully` — so by the time we run, migrations are fully
# applied. The dbo.ServerCluster wait below is therefore just a fast sanity check.
# Re-runs are safe: every insert in seed-clusters.sql is IF NOT EXISTS-guarded.
set -euo pipefail
@@ -37,7 +30,7 @@ until run_sql_in master -Q "SELECT 1" >/dev/null 2>&1; do
done
echo "[cluster-seed] SQL Server up."
echo "[cluster-seed] waiting for ${DB} database + dbo.ServerCluster table (operator must run dotnet ef database update)..."
echo "[cluster-seed] verifying ${DB} schema (dbo.ServerCluster) is present (migrator should have applied it)..."
until run_sql_in "$DB" -Q "IF OBJECT_ID('dbo.ServerCluster') IS NULL THROW 50001, 'missing', 1; SELECT 1" >/dev/null 2>&1; do
sleep 3
done
+11 -11
View File
@@ -2,9 +2,9 @@
--
-- Populates:
-- ServerCluster MAIN, SITE-A, SITE-B
-- ClusterNode driver-a, driver-b → MAIN
-- site-a-1, site-a-2 → SITE-A
-- site-b-1, site-b-2 → SITE-B
-- ClusterNode central-1, central-2 → MAIN
-- site-a-1, site-a-2 → SITE-A
-- site-b-1, site-b-2 → SITE-B
--
-- ServerCluster.NodeCount + RedundancyMode are coupled by CHECK constraint:
-- NodeCount=1 ⇒ RedundancyMode='None'
@@ -32,7 +32,7 @@ IF NOT EXISTS (SELECT 1 FROM dbo.ServerCluster WHERE ClusterId = 'MAIN')
VALUES
('MAIN', 'Main cluster', 'zb', 'docker-dev',
2, 'Warm', 1,
'docker-dev seed — admin-a/admin-b control-plane, driver-a/driver-b OPC UA publishers.',
'docker-dev seed — central-1/central-2 fused admin+driver: UI + deploy singleton + MAIN OPC UA publishers.',
'docker-dev-seed');
IF NOT EXISTS (SELECT 1 FROM dbo.ServerCluster WHERE ClusterId = 'SITE-A')
@@ -41,7 +41,7 @@ IF NOT EXISTS (SELECT 1 FROM dbo.ServerCluster WHERE ClusterId = 'SITE-A')
VALUES
('SITE-A', 'Site A', 'zb', 'site-a',
2, 'Warm', 1,
'docker-dev seed — 2-node fused admin+driver cluster.',
'docker-dev seed — 2-node driver-only, managed by the central cluster over the shared mesh (empty until configured).',
'docker-dev-seed');
IF NOT EXISTS (SELECT 1 FROM dbo.ServerCluster WHERE ClusterId = 'SITE-B')
@@ -50,11 +50,11 @@ IF NOT EXISTS (SELECT 1 FROM dbo.ServerCluster WHERE ClusterId = 'SITE-B')
VALUES
('SITE-B', 'Site B', 'zb', 'site-b',
2, 'Warm', 1,
'docker-dev seed — 2-node fused admin+driver cluster.',
'docker-dev seed — 2-node driver-only, managed by the central cluster over the shared mesh (empty until configured).',
'docker-dev-seed');
------------------------------------------------------------------------------
-- ClusterNode — main cluster OPC UA publishers
-- ClusterNode — central cluster (MAIN UI + deploy singleton + OPC UA publishers)
--
-- NodeId is "<compose-service>:4053" so it matches what ClusterRoleInfo +
-- ConfigPublishCoordinator derive from Akka.Cluster.Get(system).State.Members
@@ -62,15 +62,15 @@ IF NOT EXISTS (SELECT 1 FROM dbo.ServerCluster WHERE ClusterId = 'SITE-B')
-- ClusterNode.NodeId; mismatched values cause FK 547 on deploy.
------------------------------------------------------------------------------
IF NOT EXISTS (SELECT 1 FROM dbo.ClusterNode WHERE NodeId = 'driver-a:4053')
IF NOT EXISTS (SELECT 1 FROM dbo.ClusterNode WHERE NodeId = 'central-1:4053')
INSERT INTO dbo.ClusterNode
(NodeId, ClusterId, Host, OpcUaPort, DashboardPort, ApplicationUri, ServiceLevelBase, Enabled, CreatedBy)
VALUES ('driver-a:4053', 'MAIN', 'driver-a', 4840, 8081, 'urn:OtOpcUa:driver-a', 200, 1, 'docker-dev-seed');
VALUES ('central-1:4053', 'MAIN', 'central-1', 4840, 8081, 'urn:OtOpcUa:central-1', 200, 1, 'docker-dev-seed');
IF NOT EXISTS (SELECT 1 FROM dbo.ClusterNode WHERE NodeId = 'driver-b:4053')
IF NOT EXISTS (SELECT 1 FROM dbo.ClusterNode WHERE NodeId = 'central-2:4053')
INSERT INTO dbo.ClusterNode
(NodeId, ClusterId, Host, OpcUaPort, DashboardPort, ApplicationUri, ServiceLevelBase, Enabled, CreatedBy)
VALUES ('driver-b:4053', 'MAIN', 'driver-b', 4840, 8081, 'urn:OtOpcUa:driver-b', 150, 1, 'docker-dev-seed');
VALUES ('central-2:4053', 'MAIN', 'central-2', 4840, 8081, 'urn:OtOpcUa:central-2', 150, 1, 'docker-dev-seed');
------------------------------------------------------------------------------
-- ClusterNode — site A
+7 -55
View File
@@ -1,12 +1,10 @@
# docker-dev companion to scripts/install/traefik-dynamic.yml. Routes three
# Akka clusters that share the Compose network:
# docker-dev companion to scripts/install/traefik-dynamic.yml. Routes the single
# central cluster UI on the shared Compose network:
#
# - Main cluster (default): PathPrefix(`/`) → admin-a / admin-b.
# - Site A cluster: Host(`site-a.localhost`) → site-a-1 / site-a-2.
# - Site B cluster: Host(`site-b.localhost`) → site-b-1 / site-b-2.
# - Central UI (only route): PathPrefix(`/`) → central-1 / central-2.
#
# Host-header rules are more specific than PathPrefix, so they win over the
# default router for the site hostnames automatically — no priority field needed.
# The driver-only site nodes serve no UI, so they have no Traefik route — the
# central cluster manages and deploys to them over the shared Akka mesh.
http:
routers:
@@ -15,16 +13,6 @@ http:
rule: "PathPrefix(`/`)"
service: otopcua-admin
otopcua-site-a:
entryPoints: ["web"]
rule: "Host(`site-a.localhost`)"
service: otopcua-site-a
otopcua-site-b:
entryPoints: ["web"]
rule: "Host(`site-b.localhost`)"
service: otopcua-site-b
services:
otopcua-admin:
loadBalancer:
@@ -37,44 +25,8 @@ http:
httpOnly: true
sameSite: lax
servers:
- url: "http://admin-a:9000"
- url: "http://admin-b:9000"
healthCheck:
path: /health/active
interval: 5s
timeout: 2s
otopcua-site-a:
loadBalancer:
# Blazor Server uses SignalR; the WebSocket upgrade must hit the same
# backend that owns the circuit ID. Sticky cookie keeps each session
# pinned to one node so the post-handshake WebSocket doesn't 404.
sticky:
cookie:
name: otopcua_lb
httpOnly: true
sameSite: lax
servers:
- url: "http://site-a-1:9000"
- url: "http://site-a-2:9000"
healthCheck:
path: /health/active
interval: 5s
timeout: 2s
otopcua-site-b:
loadBalancer:
# Blazor Server uses SignalR; the WebSocket upgrade must hit the same
# backend that owns the circuit ID. Sticky cookie keeps each session
# pinned to one node so the post-handshake WebSocket doesn't 404.
sticky:
cookie:
name: otopcua_lb
httpOnly: true
sameSite: lax
servers:
- url: "http://site-b-1:9000"
- url: "http://site-b-2:9000"
- url: "http://central-1:9000"
- url: "http://central-2:9000"
healthCheck:
path: /health/active
interval: 5s
@@ -0,0 +1,193 @@
# Per-ClusterId Scoping (hub-and-spoke single mesh) — Design
**Date:** 2026-06-07
**Status:** Approved (brainstorming complete; next step: writing-plans)
**Branch:** `feat/per-cluster-scoping`
## Goal
Let one **central** cluster's Admin UI manage and deploy to multiple
logically-separate clusters that share a single Akka mesh. The central cluster
runs 2 fused `admin,driver` nodes (the only UI + the only deploy singleton);
each site cluster runs 2 `driver`-only nodes. A single global deploy from the
central UI reaches every node, and **each node applies only the slice of the
configuration that belongs to its own `ClusterId`** — its drivers and its OPC UA
address space. Ship global deploy first; per-cluster deploy is a later follow-up.
## Why this needs runtime work
The deploy channel is **in-mesh**: AdminUI → `admin-operations` singleton →
`ConfigPublishCoordinator` → DistributedPubSub → driver nodes. DistributedPubSub
does not cross Akka mesh boundaries, so for the central UI to deploy to site
servers the site nodes **must join the central mesh**. But the runtime currently
assumes **one Akka mesh == one logical cluster**:
- `DriverHostActor.ReconcileDrivers` spawns **every** `DriverInstance` in the
artifact with no cluster filter (`DriverHostActor.cs:367`). The `ClusterId` on
a spec is used only to *label* health snapshots.
- `ConfigPublishCoordinator.DiscoverDriverNodes` broadcasts to **every** driver
member of the mesh, no `ClusterId` filter (`ConfigPublishCoordinator.cs:248`).
- `ConfigComposer.SnapshotAndFlattenAsync` snapshots **all** clusters' rows into
one flat artifact; the address space is built from the whole thing.
Consequence today: put MAIN + SITE-A + SITE-B nodes in one mesh and every node
spawns every cluster's drivers (Galaxy auto-stubs on Linux, so it *would* start)
and serves a **merged** address space of all three clusters. That is why the
existing docker-dev rig uses three isolated meshes.
This design adds the missing per-`ClusterId` scoping so a shared mesh behaves as
distinct logical clusters.
## Approach (chosen: A — node-side, parse-time filter, ClusterId from the artifact)
Each node resolves *its own* `ClusterId` by finding its `NodeId`
(`_localNode.Value`, format `"host:port"`, e.g. `central-1:4053`) in the
artifact's `ClusterNode` rows, then filters both the driver specs and the
address-space composition to that cluster.
The artifact is a self-contained, consistent snapshot that already includes
`ClusterNode` + `DriverInstance` + `Namespace` + `UnsArea` (all carrying
`ClusterId`), so resolution needs **no extra DB query** and has no
seal-vs-apply inconsistency window. The coordinator stays a **single broadcast**;
every node just applies its own slice.
### Alternatives considered
- **B — control-plane per-node artifact slices.** `ConfigComposer` emits a
filtered artifact per cluster and the coordinator dispatches the right slice to
each node. Rejected: turns one broadcast into per-cluster dispatch (a large
change to the deploy/ack model), contradicts "ship global first," and still
needs the same transitive `ClusterId` resolution.
- **C — runtime DB lookup for ClusterId.** Node queries `ClusterNode` by its
address at apply time, then filters post-parse. Rejected: extra DB round-trip
per node per deploy and a seal-vs-apply inconsistency window; the artifact
already contains everything A needs.
## Components
### 1. Self-`ClusterId` resolution
New helper `DeploymentArtifact.ParseClusterScope(blob, nodeId)` returning
`(string? ClusterId, int ClusterCount)`:
- `ClusterId` = the `ClusterNode` row whose `NodeId == nodeId`, else `null`.
- `ClusterCount` = number of `ServerCluster` rows in the artifact.
Both `DriverHostActor` and `OpcUaPublishActor` call it with `_localNode.Value`.
**Fallback rule (single source of truth for every filter site):**
| Condition | Behavior |
|---|---|
| `ClusterCount ≤ 1` | **Lenient — no filter** (legacy single-cluster meshes + the entire existing test suite behave exactly as today). |
| `ClusterCount > 1` and `ClusterId` resolved | **Filter to my cluster.** |
| `ClusterCount > 1` and `ClusterId` unresolved | **Apply nothing + log error** (a node in a multi-cluster mesh with no `ClusterNode` row is misconfigured; serving everything would leak other clusters' data). |
The `ClusterCount ≤ 1` lenient branch is what protects the existing ~210 v2
tests and any single-cluster deployment from any behavior change.
### 2. Driver-spawn filter — `DriverHostActor`
`DriverInstanceSpec` already carries `ClusterId`, so in `ReconcileDrivers` (and
the restart `RestoreServedState` path) apply a one-line predicate over the parsed
specs using the fallback rule. In multi-cluster mode, specs with a `null`
`ClusterId` are excluded + logged (should never occur — `ConfigComposer` always
serializes the column).
### 3. Address-space filter — `ParseComposition` + `OpcUaPublishActor`
Add `DeploymentArtifact.ParseComposition(blob, clusterId)`. At parse time the raw
artifact entities still carry `ClusterId` / `NamespaceId` / `UnsAreaId` /
`DriverInstanceId`, so build in-cluster id sets from the artifact and filter every
projection:
| Projection | Filter predicate |
|---|---|
| `UnsAreas` | `ClusterId == mine` (direct) |
| `UnsLines` | `UnsAreaId ∈ myAreas` |
| `EquipmentNodes` | `DriverInstanceId ∈ myDrivers` |
| `DriverInstancePlans` | `DriverInstanceId ∈ myDrivers` |
| `GalaxyTags` / `EquipmentTags` | `DriverInstanceId ∈ myDrivers` |
| `ScriptedAlarmPlans` | `EquipmentId ∈ myEquipment` |
`OpcUaPublishActor.HandleRebuild` resolves `myClusterId` and calls the filtered
parse before `Phase7Planner.Compute`. `_lastApplied` becomes the filtered
composition, so the incremental diff stays correct across redeploys. The no-arg
`ParseComposition(blob)` is left untouched (legacy / single-cluster path).
### 4. Deploy ack / convergence
`ConfigPublishCoordinator` keeps broadcasting to all driver members and waiting
for all acks (in the new rig all 6 nodes are driver-role). Each node applies its
slice and acks — **including a node whose cluster has an empty slice**. The one
risk: the ack must fire even when the node's plan is empty. Implementation will
**verify the ack is unconditional** and add a small fix if it is currently gated
on a non-empty change set. No change to `DiscoverDriverNodes`.
### 5. docker-dev compose + seed rewrite
- **compose:** remove `admin-a` / `admin-b` / `driver-a` / `driver-b`; add
`central-1` / `central-2` (`OTOPCUA_ROLES=admin,driver`, seed = `central-1`,
OPC UA `4840` / `4841`, ASPNETCORE UI on `:9000`). `site-a-1/2`, `site-b-1/2`
become `driver`-only (`OTOPCUA_ROLES=driver`, `Cluster__Roles__0=driver`, seed
`central-1`, OPC UA `4842``4845`), dropping their UI / Jwt / Ldap /
DeployApiKey env + Traefik exposure. All nodes share the one ConfigDb.
- **traefik:** single `PathPrefix(/)` router → `central-1` / `central-2`
(sticky cookie); drop the two site routers + services in both
`docker-compose.yml` and `traefik-dynamic.yml`.
- **seed SQL (`seed/seed-clusters.sql`):** MAIN `ClusterNode` rows become
`central-1:4053` / `central-2:4053` (replacing `driver-a` / `driver-b`);
SITE-A / SITE-B keep their `ServerCluster` + 2 `ClusterNode` rows but **no
drivers/tags** (empty sites). Update the `Notes` columns + the file header
comments. The Galaxy namespace / driver / tags stay on MAIN (they run on the
central fused nodes).
- **compose header + comment blocks:** rewrite the topology description (single
mesh, hub-and-spoke, central-only UI).
## Data flow (after the change)
1. Operator clicks **Deploy** in the central UI (or `POST /api/deployments`).
2. `admin-operations` singleton (on a central node) → `ConfigComposer` snapshots
**all** clusters' rows into one artifact → `ConfigPublishCoordinator`
broadcasts `DispatchDeployment` to **all** driver members.
3. Each node resolves its own `ClusterId` from the artifact's `ClusterNode` rows.
4. `DriverHostActor` spawns only its cluster's `DriverInstance`s.
5. `OpcUaPublishActor` materialises only its cluster's address space.
6. Every node acks; the coordinator seals the deployment when all acks arrive.
7. Result: central `:4840`/`:4841` serve MAIN's Galaxy tree; site
`:4842``:4845` serve only their own (empty until configured) trees.
## Error handling
- **Misconfigured node** (multi-cluster mesh, no matching `ClusterNode` row):
applies nothing, logs an error, still acks (so the deploy converges rather than
hanging). Surfaced for the operator to add the missing `ClusterNode` row.
- **Pre-PR / single-cluster artifacts:** `ClusterCount ≤ 1` → lenient no-filter,
identical to current behavior.
- **Empty cluster slice:** node applies an empty plan and acks normally.
## Testing
- **Unit:** `ParseClusterScope` (match / miss / count); `ParseComposition(blob,
clusterId)` (cross-cluster projections excluded; transitive resolution for
UnsLine / Equipment / Tag / ScriptedAlarm); the driver-spec filter predicate
(lenient / strict / unresolved-strict).
- **Integration:** a 2-cluster scoping test on the in-process harness — two
driver nodes assigned to different `ClusterId`s, one deploy, assert each spawns
only its cluster's drivers and materialises only its cluster's tree.
- **Backward-compat:** the existing single-cluster suites must stay green (the
`ClusterCount ≤ 1` lenient branch guarantees this).
- **Live (docker-dev rig):** bring the rig up, sign into the central UI, confirm
3 clusters listed, deploy, confirm `:4840` shows the Galaxy tree and
`:4842`/`:4844` are empty (not the merged tree).
## Classification
High-risk — touches the actor model, the Phase7 data contract, and the deploy
path. The implementation plan will be TDD'd section by section.
## Out of scope (follow-ups)
- **Per-cluster deploy** (deploy just SITE-A from the UI) — global deploy ships
first; per-cluster targeting is a later coordinator + UI enhancement.
- **Seeding demo drivers on the sites** — sites start empty; drivers are added
via the central UI.
@@ -0,0 +1,825 @@
# Per-ClusterId Scoping (hub-and-spoke single mesh) Implementation Plan
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:executing-plans to implement this plan task-by-task.
**Goal:** Let one central cluster's Admin UI deploy to multiple logically-separate
clusters that share one Akka mesh, with each node applying only its own
`ClusterId`'s drivers + OPC UA address space.
**Architecture:** Approach A — node-side, parse-time filtering. Each node resolves
its own `ClusterId` from the deployment artifact's `ClusterNode` rows (no extra DB
query) and filters both the driver specs and the address-space composition to that
cluster. The coordinator stays a single broadcast; every node applies its own
slice and acks. A single-cluster artifact filters to nothing-different, so existing
deployments + tests are unaffected.
**Tech Stack:** .NET 10, Akka.NET, EF Core, `System.Text.Json` (artifact parse),
xUnit v2 + Shouldly (Runtime.Tests uses Akka.TestKit.Xunit2), Docker Compose + Traefik.
**Design doc:** `docs/plans/2026-06-07-per-cluster-scoping-design.md` (approved).
**The fallback rule (single source of truth — implemented once in `ResolveClusterScope`):**
- artifact has **≤1 cluster** → `None` (apply everything; legacy/single-cluster + all existing tests behave identically).
- artifact has **>1 cluster** and the node's `ClusterNode` row is found → `ScopeTo(clusterId)`.
- artifact has **>1 cluster** and the node's row is **not** found → `Suppress` (apply nothing + the caller logs).
**Hard rules (carry through every task):** never `git add .` — stage by explicit
path; never stage `sql_login.txt` or `src/Server/.../pki/`; never echo the gateway
API key into a *new* tracked file; never force-push or skip hooks.
---
## Task 1: `ResolveClusterScope` + node-scoped `ParseDriverInstances`
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 6, Task 7, Task 8
**Files:**
- Modify: `src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DeploymentArtifact.cs`
- Test: `tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/DeploymentArtifactTests.cs`
**Context:** `DeploymentArtifact` is a static JSON decoder over the artifact blob
produced by `ConfigComposer.SnapshotAndFlattenAsync`. The artifact root has
Pascal-case arrays: `Clusters` (ServerCluster, has `ClusterId`), `Nodes`
(ClusterNode, has `NodeId` + `ClusterId`), `DriverInstances` (has `ClusterId`),
`Namespaces`/`UnsAreas` (have `ClusterId`), `Equipment`/`Tags`/`UnsLines`/`ScriptedAlarms`
(no `ClusterId` — traced via `DriverInstanceId`/`UnsAreaId`/`EquipmentId`).
`DriverInstanceSpec` already carries `ClusterId` (`DeploymentArtifact.cs:19`).
**Step 1: Write the failing tests**
Add to `DeploymentArtifactTests.cs`. Reuse the file's existing artifact-blob
helper if present; otherwise add this minimal builder:
```csharp
private static byte[] BlobOf(object snapshot) =>
System.Text.Json.JsonSerializer.SerializeToUtf8Bytes(snapshot);
private static object MultiClusterSnapshot() => new
{
Clusters = new[] { new { ClusterId = "MAIN" }, new { ClusterId = "SITE-A" } },
Nodes = new[]
{
new { NodeId = "central-1:4053", ClusterId = "MAIN" },
new { NodeId = "site-a-1:4053", ClusterId = "SITE-A" },
},
DriverInstances = new[]
{
new { DriverInstanceRowId = Guid.NewGuid(), DriverInstanceId = "main-galaxy", Name = "g", DriverType = "GalaxyMxGateway", Enabled = true, DriverConfig = "{}", ClusterId = "MAIN", NamespaceId = "main-ns" },
new { DriverInstanceRowId = Guid.NewGuid(), DriverInstanceId = "sa-modbus", Name = "m", DriverType = "Modbus", Enabled = true, DriverConfig = "{}", ClusterId = "SITE-A", NamespaceId = "sa-ns" },
},
};
```
```csharp
[Fact]
public void ResolveClusterScope_single_cluster_artifact_returns_None()
{
var blob = BlobOf(new { Clusters = new[] { new { ClusterId = "MAIN" } }, Nodes = Array.Empty<object>() });
var scope = DeploymentArtifact.ResolveClusterScope(blob, "central-1:4053");
scope.Mode.ShouldBe(ClusterFilterMode.None);
}
[Fact]
public void ResolveClusterScope_multi_cluster_known_node_scopes_to_its_cluster()
{
var scope = DeploymentArtifact.ResolveClusterScope(BlobOf(MultiClusterSnapshot()), "site-a-1:4053");
scope.Mode.ShouldBe(ClusterFilterMode.ScopeTo);
scope.ClusterId.ShouldBe("SITE-A");
}
[Fact]
public void ResolveClusterScope_multi_cluster_unknown_node_suppresses()
{
var scope = DeploymentArtifact.ResolveClusterScope(BlobOf(MultiClusterSnapshot()), "ghost-9:4053");
scope.Mode.ShouldBe(ClusterFilterMode.Suppress);
}
[Fact]
public void ParseDriverInstances_scoped_returns_only_my_clusters_drivers()
{
var specs = DeploymentArtifact.ParseDriverInstances(BlobOf(MultiClusterSnapshot()), "central-1:4053");
specs.Select(s => s.DriverInstanceId).ShouldBe(new[] { "main-galaxy" });
}
[Fact]
public void ParseDriverInstances_scoped_unknown_node_returns_empty()
{
var specs = DeploymentArtifact.ParseDriverInstances(BlobOf(MultiClusterSnapshot()), "ghost-9:4053");
specs.ShouldBeEmpty();
}
[Fact]
public void ParseDriverInstances_scoped_single_cluster_returns_all()
{
var blob = BlobOf(new
{
Clusters = new[] { new { ClusterId = "MAIN" } },
Nodes = new[] { new { NodeId = "n1:4053", ClusterId = "MAIN" } },
DriverInstances = new[] { new { DriverInstanceRowId = Guid.NewGuid(), DriverInstanceId = "d1", Name = "d", DriverType = "Modbus", Enabled = true, DriverConfig = "{}", ClusterId = "MAIN" } },
});
DeploymentArtifact.ParseDriverInstances(blob, "anything:4053").Select(s => s.DriverInstanceId).ShouldBe(new[] { "d1" });
}
```
**Step 2: Run the tests — verify they fail**
Run: `dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests --filter "FullyQualifiedName~DeploymentArtifactTests"`
Expected: FAIL — `ClusterFilterMode` / `ResolveClusterScope` / the 2-arg `ParseDriverInstances` don't exist.
**Step 3: Implement**
In `DeploymentArtifact.cs`, add the scope types just above `public static class DeploymentArtifact` (top-level, same namespace):
```csharp
/// <summary>How a node should scope a deployment artifact to its own ClusterId.</summary>
public enum ClusterFilterMode { None, ScopeTo, Suppress }
/// <summary>Resolved scoping decision for a node against an artifact.</summary>
/// <param name="Mode">None = apply everything (single-cluster / legacy); ScopeTo = filter to <paramref name="ClusterId"/>; Suppress = apply nothing.</param>
/// <param name="ClusterId">The node's ClusterId when <paramref name="Mode"/> is ScopeTo; otherwise null.</param>
public readonly record struct ClusterScope(ClusterFilterMode Mode, string? ClusterId);
```
Inside the class, add `ResolveClusterScope` and the 2-arg `ParseDriverInstances`
overload (place after the existing `ParseDriverInstances`):
```csharp
/// <summary>
/// Resolve how a node should scope a multi-cluster deployment artifact to its own logical
/// cluster, from the same consistent snapshot it applies (the artifact's ClusterNode rows map
/// NodeId → ClusterId; the ServerCluster count decides single- vs multi-cluster). Fallback rule:
/// ≤1 cluster ⇒ no filter (legacy single-cluster meshes + existing tests unchanged); &gt;1 cluster
/// with the node's row found ⇒ scope to that ClusterId; &gt;1 cluster with the row missing ⇒
/// suppress (apply nothing) — a node in a multi-cluster mesh with no ClusterNode row is
/// misconfigured and must not serve other clusters' data.
/// </summary>
/// <param name="blob">The deployment artifact blob.</param>
/// <param name="nodeId">This node's identity in "host:port" form (matches ClusterNode.NodeId).</param>
/// <returns>The scoping decision for this node.</returns>
public static ClusterScope ResolveClusterScope(ReadOnlySpan<byte> blob, string nodeId)
{
if (blob.IsEmpty) return new ClusterScope(ClusterFilterMode.None, null);
try
{
using var doc = JsonDocument.Parse(blob.ToArray());
var root = doc.RootElement;
var clusterCount = root.TryGetProperty("Clusters", out var cl) && cl.ValueKind == JsonValueKind.Array
? cl.GetArrayLength() : 0;
if (clusterCount <= 1) return new ClusterScope(ClusterFilterMode.None, null);
string? myCluster = null;
if (root.TryGetProperty("Nodes", out var nodes) && nodes.ValueKind == JsonValueKind.Array)
{
foreach (var el in nodes.EnumerateArray())
{
if (el.ValueKind != JsonValueKind.Object) continue;
var nid = el.TryGetProperty("NodeId", out var nEl) ? nEl.GetString() : null;
if (!string.Equals(nid, nodeId, StringComparison.Ordinal)) continue;
myCluster = el.TryGetProperty("ClusterId", out var cEl) ? cEl.GetString() : null;
break;
}
}
return string.IsNullOrWhiteSpace(myCluster)
? new ClusterScope(ClusterFilterMode.Suppress, null)
: new ClusterScope(ClusterFilterMode.ScopeTo, myCluster);
}
catch (JsonException)
{
return new ClusterScope(ClusterFilterMode.None, null);
}
}
/// <summary>Cluster-scoped overload: the driver specs a node should host given its NodeId.</summary>
/// <param name="blob">The deployment artifact blob.</param>
/// <param name="nodeId">This node's identity in "host:port" form.</param>
/// <returns>The filtered driver specs per the node's <see cref="ResolveClusterScope"/> decision.</returns>
public static IReadOnlyList<DriverInstanceSpec> ParseDriverInstances(ReadOnlySpan<byte> blob, string nodeId)
{
var scope = ResolveClusterScope(blob, nodeId);
var all = ParseDriverInstances(blob);
return scope.Mode switch
{
ClusterFilterMode.Suppress => Array.Empty<DriverInstanceSpec>(),
ClusterFilterMode.ScopeTo => all.Where(
s => string.Equals(s.ClusterId, scope.ClusterId, StringComparison.Ordinal)).ToArray(),
_ => all,
};
}
```
**Step 4: Run the tests — verify they pass**
Run: `dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests --filter "FullyQualifiedName~DeploymentArtifactTests"`
Expected: PASS (new + all pre-existing DeploymentArtifact tests).
**Step 5: Commit**
```bash
git add src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DeploymentArtifact.cs \
tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/DeploymentArtifactTests.cs
git commit -m "feat(runtime): ClusterId scope resolution + node-scoped driver-spec parse"
```
**Acceptance:** `ResolveClusterScope` implements the 3-branch rule; the scoped
`ParseDriverInstances` filters per the rule; the no-arg overload is untouched.
---
## Task 2: Node-scoped `ParseComposition` (address-space filter)
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 6, Task 7, Task 8
**Blocked by:** Task 1 (uses `ClusterScope` / `ResolveClusterScope`, same file).
**Files:**
- Modify: `src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DeploymentArtifact.cs`
- Test: `tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/DeploymentArtifactTests.cs`
**Context:** `ParseComposition(blob)` returns a `Phase7CompositionResult` whose
projections carry no `ClusterId`. Filter by building in-cluster id sets from the
raw artifact: `DriverInstanceId`s and `UnsAreaId`s whose row's `ClusterId` matches,
plus `EquipmentId`s whose `DriverInstanceId` is in-cluster. Then filter each
projection (areas by `UnsAreaId`, lines by `UnsAreaId`, equipment by `EquipmentId`,
drivers/galaxyTags/equipmentTags by `DriverInstanceId`, alarms by `EquipmentId`).
**Step 1: Write the failing tests**
Extend `MultiClusterSnapshot()` from Task 1 with namespaces + tags so galaxy-tag
filtering is exercised, then add the test:
```csharp
private static object MultiClusterSnapshotWithTags() => new
{
Clusters = new[] { new { ClusterId = "MAIN" }, new { ClusterId = "SITE-A" } },
Nodes = new[]
{
new { NodeId = "central-1:4053", ClusterId = "MAIN" },
new { NodeId = "site-a-1:4053", ClusterId = "SITE-A" },
},
DriverInstances = new[]
{
new { DriverInstanceId = "main-galaxy", DriverType = "GalaxyMxGateway", DriverConfig = "{}", ClusterId = "MAIN", NamespaceId = "main-ns" },
new { DriverInstanceId = "sa-galaxy", DriverType = "GalaxyMxGateway", DriverConfig = "{}", ClusterId = "SITE-A", NamespaceId = "sa-ns" },
},
Namespaces = new[]
{
new { NamespaceId = "main-ns", ClusterId = "MAIN", Kind = 1 },
new { NamespaceId = "sa-ns", ClusterId = "SITE-A", Kind = 1 },
},
Tags = new[]
{
new { TagId = "t-main", DriverInstanceId = "main-galaxy", EquipmentId = (string?)null, Name = "M1", FolderPath = "F", DataType = "Boolean", TagConfig = "{}" },
new { TagId = "t-sa", DriverInstanceId = "sa-galaxy", EquipmentId = (string?)null, Name = "S1", FolderPath = "F", DataType = "Boolean", TagConfig = "{}" },
},
};
[Fact]
public void ParseComposition_scoped_keeps_only_my_clusters_drivers_and_tags()
{
var blob = BlobOf(MultiClusterSnapshotWithTags());
var main = DeploymentArtifact.ParseComposition(blob, "central-1:4053");
main.DriverInstancePlans.Select(d => d.DriverInstanceId).ShouldBe(new[] { "main-galaxy" });
main.GalaxyTags.Select(t => t.TagId).ShouldBe(new[] { "t-main" });
var siteA = DeploymentArtifact.ParseComposition(blob, "site-a-1:4053");
siteA.DriverInstancePlans.Select(d => d.DriverInstanceId).ShouldBe(new[] { "sa-galaxy" });
siteA.GalaxyTags.Select(t => t.TagId).ShouldBe(new[] { "t-sa" });
}
[Fact]
public void ParseComposition_scoped_unknown_node_is_empty()
{
var comp = DeploymentArtifact.ParseComposition(BlobOf(MultiClusterSnapshotWithTags()), "ghost-9:4053");
comp.GalaxyTags.ShouldBeEmpty();
comp.DriverInstancePlans.ShouldBeEmpty();
}
[Fact]
public void ParseComposition_single_cluster_node_id_overload_matches_legacy()
{
var blob = BlobOf(new
{
Clusters = new[] { new { ClusterId = "MAIN" } },
Nodes = new[] { new { NodeId = "n1:4053", ClusterId = "MAIN" } },
DriverInstances = new[] { new { DriverInstanceId = "d1", DriverType = "Modbus", DriverConfig = "{}", ClusterId = "MAIN", NamespaceId = "ns" } },
});
DeploymentArtifact.ParseComposition(blob, "anything:4053").DriverInstancePlans.Count
.ShouldBe(DeploymentArtifact.ParseComposition(blob).DriverInstancePlans.Count);
}
```
**Step 2: Run — verify FAIL** (2-arg `ParseComposition` missing):
`dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests --filter "FullyQualifiedName~DeploymentArtifactTests"`
**Step 3: Implement**
Add to `DeploymentArtifact.cs` (after the no-arg `ParseComposition`):
```csharp
/// <summary>Cluster-scoped overload: the address-space composition a node should materialise given
/// its NodeId. Filters every projection to the node's own ClusterId (see <see cref="ResolveClusterScope"/>).</summary>
/// <param name="blob">The deployment artifact blob.</param>
/// <param name="nodeId">This node's identity in "host:port" form.</param>
/// <returns>The filtered composition per the node's scoping decision.</returns>
public static Phase7CompositionResult ParseComposition(ReadOnlySpan<byte> blob, string nodeId)
{
var scope = ResolveClusterScope(blob, nodeId);
if (scope.Mode == ClusterFilterMode.None) return ParseComposition(blob);
if (scope.Mode == ClusterFilterMode.Suppress) return Empty();
var full = ParseComposition(blob);
var sets = BuildClusterSets(blob, scope.ClusterId!);
return new Phase7CompositionResult(
full.UnsAreas.Where(a => sets.AreaIds.Contains(a.UnsAreaId)).ToArray(),
full.UnsLines.Where(l => sets.AreaIds.Contains(l.UnsAreaId)).ToArray(),
full.EquipmentNodes.Where(e => sets.EquipmentIds.Contains(e.EquipmentId)).ToArray(),
full.DriverInstancePlans.Where(d => sets.DriverIds.Contains(d.DriverInstanceId)).ToArray(),
full.ScriptedAlarmPlans.Where(a => sets.EquipmentIds.Contains(a.EquipmentId)).ToArray(),
full.GalaxyTags.Where(t => sets.DriverIds.Contains(t.DriverInstanceId)).ToArray())
{
EquipmentTags = full.EquipmentTags.Where(t => sets.DriverIds.Contains(t.DriverInstanceId)).ToArray(),
};
}
private sealed record ClusterSets(HashSet<string> DriverIds, HashSet<string> AreaIds, HashSet<string> EquipmentIds);
/// <summary>Build the in-cluster id sets used to filter a composition: DriverInstanceIds + UnsAreaIds
/// that directly carry the ClusterId, plus EquipmentIds whose DriverInstanceId is in-cluster.</summary>
private static ClusterSets BuildClusterSets(ReadOnlySpan<byte> blob, string clusterId)
{
var driverIds = new HashSet<string>(StringComparer.Ordinal);
var areaIds = new HashSet<string>(StringComparer.Ordinal);
var equipmentIds = new HashSet<string>(StringComparer.Ordinal);
try
{
using var doc = JsonDocument.Parse(blob.ToArray());
var root = doc.RootElement;
CollectIdsWhereCluster(root, "DriverInstances", "DriverInstanceId", clusterId, driverIds);
CollectIdsWhereCluster(root, "UnsAreas", "UnsAreaId", clusterId, areaIds);
// Equipment carries no ClusterId — include it when its DriverInstanceId is in-cluster.
if (root.TryGetProperty("Equipment", out var eq) && eq.ValueKind == JsonValueKind.Array)
{
foreach (var el in eq.EnumerateArray())
{
if (el.ValueKind != JsonValueKind.Object) continue;
var di = el.TryGetProperty("DriverInstanceId", out var diEl) ? diEl.GetString() : null;
var id = el.TryGetProperty("EquipmentId", out var idEl) ? idEl.GetString() : null;
if (!string.IsNullOrWhiteSpace(id) && di is not null && driverIds.Contains(di))
equipmentIds.Add(id!);
}
}
}
catch (JsonException) { /* empty sets ⇒ nothing matches ⇒ empty composition */ }
return new ClusterSets(driverIds, areaIds, equipmentIds);
}
private static void CollectIdsWhereCluster(
JsonElement root, string arrayName, string idField, string clusterId, HashSet<string> into)
{
if (!root.TryGetProperty(arrayName, out var arr) || arr.ValueKind != JsonValueKind.Array) return;
foreach (var el in arr.EnumerateArray())
{
if (el.ValueKind != JsonValueKind.Object) continue;
var cid = el.TryGetProperty("ClusterId", out var cEl) ? cEl.GetString() : null;
if (!string.Equals(cid, clusterId, StringComparison.Ordinal)) continue;
var id = el.TryGetProperty(idField, out var idEl) ? idEl.GetString() : null;
if (!string.IsNullOrWhiteSpace(id)) into.Add(id!);
}
}
```
Note: equipment is filtered via its `DriverInstanceId` (schema-guaranteed present
for equipment-namespace rows). If a future schema allows equipment with a null
`DriverInstanceId`, extend `BuildClusterSets` to also include equipment whose
`UnsLineId` maps to an in-cluster `UnsArea` — out of scope here (the dev rig's
sites are empty).
**Step 4: Run — verify PASS** (new + pre-existing tests):
`dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests --filter "FullyQualifiedName~DeploymentArtifactTests"`
**Step 5: Commit**
```bash
git add src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DeploymentArtifact.cs \
tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/DeploymentArtifactTests.cs
git commit -m "feat(runtime): node-scoped ParseComposition filters address space by ClusterId"
```
**Acceptance:** scoped `ParseComposition` excludes cross-cluster projections;
single-cluster + unknown-node behavior matches the rule; no-arg overload untouched.
---
## Task 3: Wire driver-spawn + SubscribeBulk filtering into `DriverHostActor`
**Classification:** high-risk
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 4
**Blocked by:** Task 1, Task 2.
**Files:**
- Modify: `src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DriverHostActor.cs:367` (ReconcileDrivers) and `:432` (PushDesiredSubscriptions)
- Test: `tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/` (add `DriverHostActorClusterScopeTests.cs`, or extend an existing DriverHostActor test if present)
**Context:** `ReconcileDrivers` (line 349) loads the artifact blob then calls
`ParseDriverInstances(blob)`. `PushDesiredSubscriptions` (line 412) calls
`ParseComposition(blob)`. Both run for normal applies (`ApplyAndAck`, line 311/321)
**and** restart restore (`RestoreApplied`, line 393/395) — so changing these two
call sites covers both paths. The ack (`SendAck`, line 314) fires unconditionally
*before* the rebuild, so an empty/suppressed slice still acks — no ack change needed.
**Step 1: Write the failing test**
A TestKit test that a driver node in a multi-cluster artifact spawns only its
cluster's drivers, and a node whose cluster has no drivers still reaches Applied
(acks). Model it on the existing DriverHostActor tests in the same folder (reuse
their in-memory DbContext + DispatchDeployment plumbing — inspect a sibling test
for the exact harness helpers). Core assertions:
```csharp
// Given a sealed deployment whose artifact has 2 clusters (MAIN: 1 driver, SITE-A: 1 driver)
// and a DriverHostActor whose _localNode = "site-a-1:4053":
// - after DispatchDeployment, GetDiagnostics shows ONLY the SITE-A driver (not MAIN's)
// - the node sends an Applied ApplyAck (convergence holds even though it ignored MAIN's driver)
// And a second actor with _localNode = "central-1:4053" shows ONLY the MAIN driver.
```
**Step 2: Run — verify FAIL** (node currently spawns both clusters' drivers):
`dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests --filter "FullyQualifiedName~DriverHostActorClusterScope"`
**Step 3: Implement** — two one-line call-site changes:
`DriverHostActor.cs:367`:
```csharp
// before:
var specs = DeploymentArtifact.ParseDriverInstances(blob);
// after:
var specs = DeploymentArtifact.ParseDriverInstances(blob, _localNode.Value);
```
`DriverHostActor.cs:432`:
```csharp
// before:
composition = DeploymentArtifact.ParseComposition(blob);
// after:
composition = DeploymentArtifact.ParseComposition(blob, _localNode.Value);
```
**Step 4: Run — verify PASS, then the whole Runtime suite for no regression:**
```bash
dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests
```
Expected: new test green; all pre-existing tests green (single-cluster harnesses
hit the `None` branch → unchanged).
**Step 5: Commit**
```bash
git add src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DriverHostActor.cs \
tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/DriverHostActorClusterScopeTests.cs
git commit -m "feat(runtime): DriverHost spawns + subscribes only its own ClusterId's drivers"
```
**Acceptance:** a site node spawns only its cluster's drivers and still acks
Applied with an empty slice; existing single-cluster tests stay green.
---
## Task 4: Wire scoped composition into `OpcUaPublishActor.HandleRebuild`
**Classification:** high-risk
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 3
**Blocked by:** Task 2.
**Files:**
- Modify: `src/Server/ZB.MOM.WW.OtOpcUa.Runtime/OpcUa/OpcUaPublishActor.cs:212` (HandleRebuild)
- Test: `tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/OpcUa/OpcUaPublishActorRebuildTests.cs`
**Context:** `HandleRebuild` (line ~210) loads the artifact then calls
`ParseComposition(artifact)` and materialises via `Phase7Applier`. `_localNode` is
`NodeId?` (line 46) — null on legacy/dev callers, so guard for null. The existing
`OpcUaPublishActorRebuildTests` use a fake/inspectable sink — reuse that pattern.
**Step 1: Write the failing test**
```csharp
// Build a 2-cluster artifact (MAIN galaxy tag t-main; SITE-A galaxy tag t-sa),
// seal it as a Deployment row in the test DbContext, construct an OpcUaPublishActor
// with _localNode = NodeId.Parse("site-a-1:4053") and an inspectable sink, send
// RebuildAddressSpace(correlation, depId), then assert the sink received ONLY the
// SITE-A variable/folders (t-sa) and NOT the MAIN ones (t-main).
// Mirror with _localNode = "central-1:4053" → only MAIN.
```
**Step 2: Run — verify FAIL** (sink currently gets both clusters' nodes).
**Step 3: Implement**`OpcUaPublishActor.cs:212`:
```csharp
// before:
var composition = DeploymentArtifact.ParseComposition(artifact);
// after: scope to this node's ClusterId when we know our identity; legacy/dev callers (null
// _localNode) keep the unscoped behaviour.
var composition = _localNode is { } ln
? DeploymentArtifact.ParseComposition(artifact, ln.Value)
: DeploymentArtifact.ParseComposition(artifact);
```
**Step 4: Run — verify PASS + full Runtime suite:**
```bash
dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests
```
**Step 5: Commit**
```bash
git add src/Server/ZB.MOM.WW.OtOpcUa.Runtime/OpcUa/OpcUaPublishActor.cs \
tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/OpcUa/OpcUaPublishActorRebuildTests.cs
git commit -m "feat(runtime): OPC UA rebuild materialises only the node's ClusterId slice"
```
**Acceptance:** a node materialises only its own cluster's address space; null
`_localNode` keeps legacy behavior; existing rebuild tests stay green.
---
## Task 5: Multi-cluster scoping E2E on the cluster harness
**Classification:** high-risk
**Estimated implement time:** ~5 min
**Parallelizable with:** none
**Blocked by:** Task 3, Task 4.
**Files:**
- Test: `tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests/` (add `MultiClusterScopingTests.cs`)
- Possibly Modify: `tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests/TwoNodeClusterHarness.cs` (only if a 2-ClusterId seed helper is needed)
**Context:** `TwoNodeClusterHarness` boots an in-process 2-node cluster on an
in-memory DB with a null OPC UA sink. It proves the *deploy path* end-to-end
(compose → broadcast → apply → ack) but cannot assert a materialised tree (null
sink). So this test asserts **driver** scoping through the real path: seed two
`ServerCluster` rows + two `ClusterNode` rows (one per node, different `ClusterId`)
+ one `DriverInstance` per cluster, run one deployment, and assert via each node's
`GetDiagnostics` that each node hosts only its own cluster's driver.
If the harness's seed helpers can't express two clusters cleanly, that's a plan
defect — surface it; the unit + actor tests (Tasks 14) already cover the scoping
logic, and Task 9 covers the live proof.
**Step 1: Write the failing test** (per the context above).
**Step 2: Run — verify FAIL.**
**Step 3:** No production code — this test passes once Tasks 3+4 are in. If it
fails, the defect is in 3/4; fix there, not here.
**Step 4: Run — verify PASS:**
`dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests --filter "FullyQualifiedName~MultiClusterScoping"`
**Step 5: Commit**
```bash
git add tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests/MultiClusterScopingTests.cs
# add TwoNodeClusterHarness.cs ONLY if you modified it
git commit -m "test(integration): multi-cluster deploy scopes drivers per node"
```
**Acceptance:** one deploy over a 2-cluster mesh leaves each node hosting only its
own cluster's driver.
---
## Task 6: Rewrite `docker-dev/docker-compose.yml` for the single mesh
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 1, Task 2, Task 7, Task 8
**Files:**
- Modify: `docker-dev/docker-compose.yml`
**Context:** Today MAIN is 4 nodes (`admin-a`/`admin-b` admin + `driver-a`/`driver-b`
driver) as one mesh; SITE-A/SITE-B are 2-node fused meshes with their own seeds.
The anchor `&otopcua-host` (currently on `admin-a`) holds the shared build + env.
**Steps:**
1. Replace the four MAIN services with **two fused nodes**:
- `central-1` (becomes the `&otopcua-host` anchor): `OTOPCUA_ROLES: "admin,driver"`,
`ASPNETCORE_URLS: "http://+:9000"`, `Cluster__PublicHostname: "central-1"`,
`Cluster__SeedNodes__0: "akka.tcp://otopcua@central-1:4053"`, `Cluster__Roles__0: "admin"`,
`Cluster__Roles__1: "driver"`, keep all `Security__*` (Jwt/Ldap/DeployApiKey) + `GALAXY_MXGW_API_KEY`
(keep the existing `${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_...}` default — do **not** introduce a new
hardcoded key), `ports: ["4840:4840"]`.
- `central-2`: same env, `Cluster__PublicHostname: "central-2"`, seed → `central-1`,
`ports: ["4841:4840"]`, `depends_on: { sql: healthy, central-1: started }`.
2. Convert the four site services to **driver-only**, all seeding `central-1`:
- For each of `site-a-1`, `site-a-2`, `site-b-1`, `site-b-2`: `OTOPCUA_ROLES: "driver"`,
`Cluster__Roles__0: "driver"` (remove the `admin` role + `Cluster__Roles__1`), keep
`Cluster__PublicHostname` = own name, set `Cluster__SeedNodes__0: "akka.tcp://otopcua@central-1:4053"`,
**remove** `ASPNETCORE_URLS` + the `Security__Jwt__*` / `Security__Ldap__*` / `Security__DeployApiKey`
block (driver-only nodes serve no UI and authenticate no users), keep the `ConnectionStrings__ConfigDb`
+ `GALAXY_MXGW_API_KEY` lines, keep their OPC UA ports (`4842``4845`), and add
`depends_on: { sql: healthy, central-1: started }`.
3. Update `traefik.depends_on` to `[central-1, central-2]` (drop the removed services).
4. Rewrite the header comment block (lines ~140) to describe the **single mesh,
hub-and-spoke** topology: one Akka mesh seeded by `central-1`; `central-1/2`
are `admin,driver` (the only UI + deploy singleton); `site-*` are `driver`-only
members scoped by `ClusterId`; central UI at `:9200` manages + deploys to all.
Keep the existing accurate notes (SQL persistence, mesh isolation note now
describes a single mesh, headless deploy via `:9200/api/deployments`).
**Verify:**
```bash
docker compose -f docker-dev/docker-compose.yml config --quiet && echo "compose OK"
```
Expected: `compose OK`.
**Commit:**
```bash
git add docker-dev/docker-compose.yml
git commit -m "feat(docker-dev): single-mesh hub-and-spoke (central-1/2 + driver-only sites)"
```
**Acceptance:** `docker compose config` parses; central nodes fused with UI; site
nodes driver-only seeding central; no new hardcoded API key.
---
## Task 7: Rewrite `docker-dev/traefik-dynamic.yml` (central-only route)
**Classification:** small
**Estimated implement time:** ~3 min
**Parallelizable with:** Task 1, Task 2, Task 6, Task 8
**Files:**
- Modify: `docker-dev/traefik-dynamic.yml`
**Steps:**
1. Keep the `otopcua-admin` router (`PathPrefix(`/`)`) + service, but point its
`loadBalancer.servers` at `http://central-1:9000` and `http://central-2:9000`.
2. Delete the `otopcua-site-a` and `otopcua-site-b` routers **and** their services
(driver-only sites serve no UI). Keep the sticky-cookie + `/health/active`
healthcheck on the surviving `otopcua-admin` service.
3. Update the file header comment to describe the single central UI route.
**Verify:** covered by Task 6's `docker compose config` (Traefik file is mounted,
not parsed by compose) — sanity-check it's valid YAML by eye; the live bring-up in
Task 9 is the real check.
**Commit:**
```bash
git add docker-dev/traefik-dynamic.yml
git commit -m "feat(docker-dev): Traefik routes only the central cluster UI"
```
**Acceptance:** one router → central-1/central-2; site routers/services removed.
---
## Task 8: Rewrite `docker-dev/seed/seed-clusters.sql` (MAIN nodes → central-1/2)
**Classification:** small
**Estimated implement time:** ~4 min
**Parallelizable with:** Task 1, Task 2, Task 6, Task 7
**Files:**
- Modify: `docker-dev/seed/seed-clusters.sql`
**Context:** The seed inserts 3 `ServerCluster` rows + 6 `ClusterNode` rows. MAIN's
`ClusterNode` rows are currently `driver-a:4053` / `driver-b:4053`. Sites already
have **no** drivers/tags (only `ServerCluster` + `ClusterNode`), which matches the
"empty sites" decision — leave them empty.
**Steps:**
1. Change MAIN's two `ClusterNode` inserts from `driver-a` / `driver-b` to
`central-1` / `central-2`: `NodeId` `central-1:4053` / `central-2:4053`,
`Host` `central-1` / `central-2`, `ApplicationUri` `urn:OtOpcUa:central-1` /
`urn:OtOpcUa:central-2`, keep `OpcUaPort 4840`, `ServiceLevelBase` 200/150.
Update the `IF NOT EXISTS ... WHERE NodeId = '...'` guards to the new ids.
2. Update the MAIN `ServerCluster` `Notes` to "central-1/central-2 fused
admin+driver — UI + deploy singleton + MAIN OPC UA publishers."
3. Update the SITE-A/SITE-B `ServerCluster` `Notes` to "2-node driver-only,
managed by the central cluster over the shared mesh (empty until configured)."
4. Update the file header comment block (the `ClusterNode` map at lines ~57) to
`central-1, central-2 → MAIN`.
5. Leave the Galaxy namespace/driver/tags (MAIN) and the LDAP→role mappings
unchanged.
**Verify:** SQL isn't run locally on macOS (no SQL reachable); correctness is
confirmed by the live bring-up in Task 9 (the `cluster-seed` job runs it). Eyeball
that every changed `NodeId`/guard is consistent.
**Commit:**
```bash
git add docker-dev/seed/seed-clusters.sql
git commit -m "feat(docker-dev): seed MAIN ClusterNodes as central-1/central-2"
```
**Acceptance:** MAIN `ClusterNode` rows are `central-1`/`central-2`; sites keep
their cluster + node rows with no drivers; notes/header updated.
---
## Task 9: Live docker-dev verification
**Classification:** standard (verification — no subagent review needed)
**Estimated implement time:** ~5 min (plus container build time)
**Parallelizable with:** none
**Blocked by:** Task 3, Task 4, Task 5, Task 6, Task 7, Task 8.
**Files:** none (operational).
**Steps (run from repo root on this Mac; Docker is local):**
1. Sync deployment + rebuild the image and bring the rig up:
```bash
docker compose -f docker-dev/docker-compose.yml down
docker compose -f docker-dev/docker-compose.yml up -d --build
```
2. Confirm the mesh formed and the seed ran:
```bash
docker compose -f docker-dev/docker-compose.yml ps
docker compose -f docker-dev/docker-compose.yml logs cluster-seed --tail=40
```
Expect 3 `ServerCluster` rows + 6 `ClusterNode` rows (`central-1/2`, `site-a-1/2`, `site-b-1/2`).
3. Trigger a global deploy headlessly (no UI login needed — the deploy API is on
the central admin nodes):
```bash
curl -s -X POST http://localhost:9200/api/deployments \
-H "X-Api-Key: docker-dev-deploy-key" -H "Content-Type: application/json" \
-d '{"createdBy":"per-cluster-verify"}'
```
Expect `202` + a `deploymentId`.
4. Confirm scoping in the driver logs — central applies the Galaxy driver, sites apply empty:
```bash
docker compose -f docker-dev/docker-compose.yml logs central-1 | grep -iE "applied deployment|galaxy|materialis"
docker compose -f docker-dev/docker-compose.yml logs site-a-1 | grep -iE "applied deployment|galaxy|materialis"
```
Expect `central-1` to materialise the MAIN Galaxy tags; `site-a-1` to apply with **no** Galaxy/MAIN nodes.
5. Browse-check with the Client CLI:
```bash
dotnet run --project src/Client/ZB.MOM.WW.OtOpcUa.Client.CLI -- browse -u opc.tcp://localhost:4840 -r -d 4 # central: Galaxy tree
dotnet run --project src/Client/ZB.MOM.WW.OtOpcUa.Client.CLI -- browse -u opc.tcp://localhost:4842 -r -d 4 # site-a: empty (no MAIN tree)
```
Expect the MAIN Galaxy hierarchy on `:4840` and an **empty** address space on `:4842` (NOT the merged tree).
**Acceptance:** the rig boots as one mesh; a single deploy populates the central
OPC UA tree and leaves the site nodes empty (proving per-ClusterId scoping live).
If anything regresses, stop and debug (do not paper over).
---
## Task 10: Update docker-dev docs + memory
**Classification:** small
**Estimated implement time:** ~4 min
**Parallelizable with:** Task 9
**Blocked by:** Task 6 (final topology).
**Files:**
- Modify: `CLAUDE.md` (the Docker Workflow / docker-dev references that mention the cluster layout)
- Modify: `docs/v2/dev-environment.md` (if it describes the three-isolated-mesh topology)
- Modify: `/Users/dohertj2/.claude/projects/-Users-dohertj2-Desktop-OtOpcUa/memory/project_dev_environment.md` + `MEMORY.md` pointer
**Steps:**
1. In `CLAUDE.md` and `docs/v2/dev-environment.md`, update any description of the
docker-dev topology from "three isolated meshes / MAIN admin-a+admin-b /
site fused" to "single mesh, hub-and-spoke: `central-1`/`central-2` fused
admin+driver own the only UI + deploy singleton; `site-*` are driver-only
members scoped by `ClusterId`; central UI at `:9200` deploys to all." Update
the OPC UA endpoint list (`central-1` `:4840`, `central-2` `:4841`, sites
`:4842``:4845`).
2. Update the `project_dev_environment.md` memory's docker-dev section to match
(node names, hub-and-spoke, "central UI deploys to all clusters"). Keep the
one-line `MEMORY.md` pointer accurate.
**Commit:**
```bash
git add CLAUDE.md docs/v2/dev-environment.md
git commit -m "docs(docker-dev): document single-mesh hub-and-spoke topology"
```
(The memory files live outside the repo — write them with the memory workflow, not git.)
**Acceptance:** docs + memory describe the new topology accurately.
---
## Done criteria
- `dotnet build ZB.MOM.WW.OtOpcUa.slnx` clean.
- `dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests` green (new scoping
tests + no regressions) and `...Host.IntegrationTests` multi-cluster test green.
- The full pre-existing suite stays green (the `ClusterCount ≤ 1` lenient branch
guarantees single-cluster behavior is unchanged).
- `docker compose config` parses; the live rig (Task 9) shows central serving the
Galaxy tree and sites empty under one global deploy.
## Out of scope (follow-ups)
- **Per-cluster deploy** (deploy just SITE-A from the UI) — coordinator + UI work.
- **Seeding demo drivers on the sites** — added via the central UI.
@@ -0,0 +1,16 @@
{
"planPath": "docs/plans/2026-06-07-per-cluster-scoping.md",
"tasks": [
{"id": 1, "subject": "Task 1: ResolveClusterScope + node-scoped ParseDriverInstances", "status": "pending"},
{"id": 2, "subject": "Task 2: Node-scoped ParseComposition (address-space filter)", "status": "pending", "blockedBy": [1]},
{"id": 3, "subject": "Task 3: Wire driver-spawn + SubscribeBulk filtering into DriverHostActor", "status": "pending", "blockedBy": [1, 2]},
{"id": 4, "subject": "Task 4: Wire scoped composition into OpcUaPublishActor.HandleRebuild", "status": "pending", "blockedBy": [2]},
{"id": 5, "subject": "Task 5: Multi-cluster scoping E2E on the cluster harness", "status": "pending", "blockedBy": [3, 4]},
{"id": 6, "subject": "Task 6: Rewrite docker-dev/docker-compose.yml for the single mesh", "status": "pending"},
{"id": 7, "subject": "Task 7: Rewrite docker-dev/traefik-dynamic.yml (central-only route)", "status": "pending"},
{"id": 8, "subject": "Task 8: Rewrite docker-dev/seed/seed-clusters.sql (MAIN nodes -> central-1/2)", "status": "pending"},
{"id": 9, "subject": "Task 9: Live docker-dev verification", "status": "pending", "blockedBy": [3, 4, 5, 6, 7, 8]},
{"id": 10, "subject": "Task 10: Update docker-dev docs + memory", "status": "pending", "blockedBy": [6]}
],
"lastUpdated": "2026-06-07"
}
@@ -30,10 +30,15 @@ public sealed class ConfigPublishCoordinator : ReceiveActor, IWithTimers
private readonly IDbContextFactory<OtOpcUaConfigDbContext> _dbFactory;
private readonly TimeSpan _applyDeadline;
private readonly ILoggingAdapter _log = Context.GetLogger();
private readonly Dictionary<NodeId, ApplyAckOutcome> _acks = new();
// NodeId equality here is case-insensitive (by Value) to match the case-insensitive ClusterId/
// NodeId scoping in DeploymentArtifact.ResolveClusterScope — so an ack from a node whose address
// differs only in case still matches its expected-ack entry (SQL collation + DNS are
// case-insensitive, so the same node can surface with different casing).
private static readonly IEqualityComparer<NodeId> NodeIdComparer = new CaseInsensitiveNodeIdComparer();
private readonly Dictionary<NodeId, ApplyAckOutcome> _acks = new(NodeIdComparer);
private DeploymentId? _current;
private HashSet<NodeId> _expectedAcks = new();
private HashSet<NodeId> _expectedAcks = new(NodeIdComparer);
/// <summary>Gets the timer scheduler for managing apply deadlines.</summary>
public ITimerScheduler Timers { get; set; } = null!;
@@ -88,7 +93,7 @@ public sealed class ConfigPublishCoordinator : ReceiveActor, IWithTimers
.AsNoTracking()
.ToList();
_expectedAcks = nodeStates.Select(s => NodeId.Parse(s.NodeId)).ToHashSet();
_expectedAcks = nodeStates.Select(s => NodeId.Parse(s.NodeId)).ToHashSet(NodeIdComparer);
foreach (var s in nodeStates.Where(s => s.Status != NodeDeploymentStatus.Applying))
_acks[NodeId.Parse(s.NodeId)] = s.Status == NodeDeploymentStatus.Applied
? ApplyAckOutcome.Applied
@@ -248,7 +253,7 @@ public sealed class ConfigPublishCoordinator : ReceiveActor, IWithTimers
private HashSet<NodeId> DiscoverDriverNodes()
{
var cluster = Akka.Cluster.Cluster.Get(Context.System);
var nodes = new HashSet<NodeId>();
var nodes = new HashSet<NodeId>(NodeIdComparer);
foreach (var member in cluster.State.Members)
{
if (member.Status is not (MemberStatus.Up or MemberStatus.Joining)) continue;
@@ -261,4 +266,18 @@ public sealed class ConfigPublishCoordinator : ReceiveActor, IWithTimers
}
return nodes;
}
/// <summary>Case-insensitive <see cref="NodeId"/> equality (by <see cref="NodeId.Value"/>),
/// matching the case-insensitive scoping in <c>DeploymentArtifact.ResolveClusterScope</c> so the
/// expected-ack set and incoming acks agree regardless of host-name casing.</summary>
private sealed class CaseInsensitiveNodeIdComparer : IEqualityComparer<NodeId>
{
/// <inheritdoc />
public bool Equals(NodeId x, NodeId y) =>
string.Equals(x.Value, y.Value, StringComparison.OrdinalIgnoreCase);
/// <inheritdoc />
public int GetHashCode(NodeId obj) =>
StringComparer.OrdinalIgnoreCase.GetHashCode(obj.Value ?? string.Empty);
}
}
@@ -18,6 +18,27 @@ public sealed record DriverInstanceSpec(
string DriverConfig,
string? ClusterId = null);
/// <summary>How a node should scope a deployment artifact to its own ClusterId.</summary>
public enum ClusterFilterMode
{
/// <summary>Apply everything (single-cluster / legacy deployments).</summary>
None,
/// <summary>Filter the artifact to the node's own ClusterId.</summary>
ScopeTo,
/// <summary>Apply nothing (node's cluster row not found in a multi-cluster artifact).</summary>
Suppress,
}
/// <summary>Resolved scoping decision for a node against an artifact.</summary>
/// <param name="Mode">
/// None = apply everything (single-cluster / legacy); ScopeTo = filter to <paramref name="ClusterId"/>;
/// Suppress = apply nothing.
/// </param>
/// <param name="ClusterId">The node's ClusterId when <paramref name="Mode"/> is ScopeTo; otherwise null.</param>
public readonly record struct ClusterScope(ClusterFilterMode Mode, string? ClusterId);
public static class DeploymentArtifact
{
private static readonly JsonSerializerOptions JsonOptions = new()
@@ -58,6 +79,70 @@ public static class DeploymentArtifact
}
}
/// <summary>
/// Resolve how a node should scope a deployment artifact to its own ClusterId. Single-cluster
/// (or legacy) artifacts resolve to <see cref="ClusterFilterMode.None"/> so every existing
/// deployment applies unchanged. In a multi-cluster artifact the node's <c>ClusterNode</c> row
/// (matched by <paramref name="nodeId"/>) selects <see cref="ClusterFilterMode.ScopeTo"/> its
/// ClusterId; a missing row resolves to <see cref="ClusterFilterMode.Suppress"/>. Empty /
/// malformed blobs resolve to <see cref="ClusterFilterMode.None"/> (lenient, matching the
/// other parsers).
/// </summary>
/// <param name="blob">The deployment artifact blob to inspect.</param>
/// <param name="nodeId">The node's identity (e.g. "central-1:4053") to match against <c>Nodes</c>.</param>
/// <returns>The resolved <see cref="ClusterScope"/> decision for this node.</returns>
public static ClusterScope ResolveClusterScope(ReadOnlySpan<byte> blob, string nodeId)
{
if (blob.IsEmpty) return new ClusterScope(ClusterFilterMode.None, null);
try
{
using var doc = JsonDocument.Parse(blob.ToArray());
var root = doc.RootElement;
var clusterCount = root.TryGetProperty("Clusters", out var cl) && cl.ValueKind == JsonValueKind.Array
? cl.GetArrayLength() : 0;
if (clusterCount <= 1) return new ClusterScope(ClusterFilterMode.None, null);
string? myCluster = null;
if (root.TryGetProperty("Nodes", out var nodes) && nodes.ValueKind == JsonValueKind.Array)
{
foreach (var el in nodes.EnumerateArray())
{
if (el.ValueKind != JsonValueKind.Object) continue;
var nid = el.TryGetProperty("NodeId", out var nEl) ? nEl.GetString() : null;
if (!string.Equals(nid, nodeId, StringComparison.OrdinalIgnoreCase)) continue;
myCluster = el.TryGetProperty("ClusterId", out var cEl) ? cEl.GetString() : null;
break;
}
}
return string.IsNullOrWhiteSpace(myCluster)
? new ClusterScope(ClusterFilterMode.Suppress, null)
: new ClusterScope(ClusterFilterMode.ScopeTo, myCluster);
}
catch (JsonException)
{
return new ClusterScope(ClusterFilterMode.None, null);
}
}
/// <summary>
/// Parse a deployment artifact blob into the driver-instance specs a specific node should
/// spawn, scoped to its own ClusterId via <see cref="ResolveClusterScope"/>. Single-cluster /
/// legacy artifacts return every spec; a multi-cluster artifact returns only the matching
/// cluster's specs (or none when the node's row is absent).
/// </summary>
/// <param name="blob">The deployment artifact blob to parse.</param>
/// <param name="nodeId">The node's identity (e.g. "central-1:4053") used to resolve cluster scope.</param>
/// <returns>The driver-instance specs this node should spawn.</returns>
public static IReadOnlyList<DriverInstanceSpec> ParseDriverInstances(ReadOnlySpan<byte> blob, string nodeId)
{
var scope = ResolveClusterScope(blob, nodeId);
if (scope.Mode == ClusterFilterMode.Suppress) return Array.Empty<DriverInstanceSpec>();
var all = ParseDriverInstances(blob);
return scope.Mode == ClusterFilterMode.ScopeTo
? all.Where(s => string.Equals(s.ClusterId, scope.ClusterId, StringComparison.OrdinalIgnoreCase)).ToArray()
: all;
}
private static DriverInstanceSpec? TryReadSpec(JsonElement el)
{
var rowId = el.TryGetProperty("DriverInstanceRowId", out var rowEl)
@@ -120,6 +205,115 @@ public static class DeploymentArtifact
}
}
/// <summary>Cluster-scoped overload: the address-space composition a node should materialise given
/// its NodeId. Filters every projection to the node's own ClusterId (see <see cref="ResolveClusterScope"/>).
/// When <paramref name="onInconsistency"/> is supplied it is invoked with a human-readable message for each
/// kept equipment whose owning UNS line is NOT in the node's cluster — a cross-cluster binding that
/// violates the same-cluster invariant (decision #122) and would orphan the equipment folder. This is
/// detection only (observability); the equipment is still returned, since the upstream draft validator
/// is the authority that should prevent the binding in the first place.</summary>
/// <param name="blob">The deployment artifact blob.</param>
/// <param name="nodeId">This node's identity in "host:port" form.</param>
/// <param name="onInconsistency">Optional diagnostic callback for cross-cluster orphan bindings; null disables the check.</param>
/// <returns>The filtered composition per the node's scoping decision.</returns>
public static Phase7CompositionResult ParseComposition(
ReadOnlySpan<byte> blob, string nodeId, Action<string>? onInconsistency = null)
{
var scope = ResolveClusterScope(blob, nodeId);
if (scope.Mode == ClusterFilterMode.None) return ParseComposition(blob);
if (scope.Mode == ClusterFilterMode.Suppress) return Empty();
var full = ParseComposition(blob);
var sets = BuildClusterSets(blob, scope.ClusterId!);
var keptLines = full.UnsLines.Where(l => sets.AreaIds.Contains(l.UnsAreaId)).ToArray();
var keptEquipment = full.EquipmentNodes.Where(e => sets.EquipmentIds.Contains(e.EquipmentId)).ToArray();
if (onInconsistency is not null)
{
var keptLineIds = keptLines.Select(l => l.UnsLineId).ToHashSet(StringComparer.OrdinalIgnoreCase);
foreach (var e in keptEquipment)
{
if (!string.IsNullOrEmpty(e.UnsLineId) && !keptLineIds.Contains(e.UnsLineId))
onInconsistency(
$"equipment '{e.EquipmentId}' is in cluster '{scope.ClusterId}' (by its driver) but its " +
$"UNS line '{e.UnsLineId}' is not — a cross-cluster binding violates the same-cluster " +
"invariant (decision #122) and would orphan the equipment folder.");
}
}
return new Phase7CompositionResult(
full.UnsAreas.Where(a => sets.AreaIds.Contains(a.UnsAreaId)).ToArray(),
keptLines,
keptEquipment,
full.DriverInstancePlans.Where(d => sets.DriverIds.Contains(d.DriverInstanceId)).ToArray(),
full.ScriptedAlarmPlans.Where(a => sets.EquipmentIds.Contains(a.EquipmentId)).ToArray(),
full.GalaxyTags.Where(t => sets.DriverIds.Contains(t.DriverInstanceId)).ToArray())
{
EquipmentTags = full.EquipmentTags.Where(t => sets.DriverIds.Contains(t.DriverInstanceId)).ToArray(),
};
}
/// <summary>The in-cluster id sets used to filter a composition.</summary>
/// <param name="DriverIds">DriverInstanceIds whose row carries the in-scope ClusterId.</param>
/// <param name="AreaIds">UnsAreaIds whose row carries the in-scope ClusterId.</param>
/// <param name="EquipmentIds">EquipmentIds whose owning DriverInstanceId is in-cluster.</param>
private sealed record ClusterSets(HashSet<string> DriverIds, HashSet<string> AreaIds, HashSet<string> EquipmentIds);
/// <summary>Build the in-cluster id sets used to filter a composition: DriverInstanceIds + UnsAreaIds
/// that directly carry the ClusterId, plus EquipmentIds whose DriverInstanceId is in-cluster.</summary>
/// <param name="blob">The deployment artifact blob.</param>
/// <param name="clusterId">The node's ClusterId to scope to.</param>
/// <returns>The resolved in-cluster id sets (empty on parse failure => empty composition).</returns>
private static ClusterSets BuildClusterSets(ReadOnlySpan<byte> blob, string clusterId)
{
var driverIds = new HashSet<string>(StringComparer.Ordinal);
var areaIds = new HashSet<string>(StringComparer.Ordinal);
var equipmentIds = new HashSet<string>(StringComparer.Ordinal);
try
{
using var doc = JsonDocument.Parse(blob.ToArray());
var root = doc.RootElement;
CollectIdsWhereCluster(root, "DriverInstances", "DriverInstanceId", clusterId, driverIds);
CollectIdsWhereCluster(root, "UnsAreas", "UnsAreaId", clusterId, areaIds);
// Equipment carries no ClusterId — include it when its DriverInstanceId is in-cluster.
if (root.TryGetProperty("Equipment", out var eq) && eq.ValueKind == JsonValueKind.Array)
{
foreach (var el in eq.EnumerateArray())
{
if (el.ValueKind != JsonValueKind.Object) continue;
var di = el.TryGetProperty("DriverInstanceId", out var diEl) ? diEl.GetString() : null;
var id = el.TryGetProperty("EquipmentId", out var idEl) ? idEl.GetString() : null;
if (!string.IsNullOrWhiteSpace(id) && di is not null && driverIds.Contains(di))
equipmentIds.Add(id!);
}
}
}
catch (JsonException) { /* empty sets => nothing matches => empty composition */ }
return new ClusterSets(driverIds, areaIds, equipmentIds);
}
/// <summary>Collect each row's <paramref name="idField"/> value from <paramref name="arrayName"/> whose
/// ClusterId equals <paramref name="clusterId"/> (case-insensitive, matching the codebase convention).</summary>
/// <param name="root">The artifact root element.</param>
/// <param name="arrayName">The array property name to scan (e.g. "DriverInstances").</param>
/// <param name="idField">The id field to collect from each in-cluster row.</param>
/// <param name="clusterId">The ClusterId rows must match.</param>
/// <param name="into">The set to collect matching ids into.</param>
private static void CollectIdsWhereCluster(
JsonElement root, string arrayName, string idField, string clusterId, HashSet<string> into)
{
if (!root.TryGetProperty(arrayName, out var arr) || arr.ValueKind != JsonValueKind.Array) return;
foreach (var el in arr.EnumerateArray())
{
if (el.ValueKind != JsonValueKind.Object) continue;
var cid = el.TryGetProperty("ClusterId", out var cEl) ? cEl.GetString() : null;
if (!string.Equals(cid, clusterId, StringComparison.OrdinalIgnoreCase)) continue;
var id = el.TryGetProperty(idField, out var idEl) ? idEl.GetString() : null;
if (!string.IsNullOrWhiteSpace(id)) into.Add(id!);
}
}
private static Phase7CompositionResult Empty() => new(
Array.Empty<UnsAreaProjection>(),
Array.Empty<UnsLineProjection>(),
@@ -364,7 +364,7 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
return;
}
var specs = DeploymentArtifact.ParseDriverInstances(blob);
var specs = DeploymentArtifact.ParseDriverInstances(blob, _localNode.Value);
var snapshots = _children.ToDictionary(
kv => kv.Key,
kv => new DriverChildSnapshot(kv.Value.DriverType, kv.Value.LastConfigJson),
@@ -429,7 +429,7 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
Phase7CompositionResult composition;
try
{
composition = DeploymentArtifact.ParseComposition(blob);
composition = DeploymentArtifact.ParseComposition(blob, _localNode.Value);
}
catch (Exception ex)
{
@@ -209,7 +209,10 @@ public sealed class OpcUaPublishActor : ReceiveActor
var artifact = msg.DeploymentId is { } depId
? LoadArtifact(depId)
: LoadLatestArtifact();
var composition = DeploymentArtifact.ParseComposition(artifact);
var composition = _localNode is { } ln
? DeploymentArtifact.ParseComposition(artifact, ln.Value,
inconsistency => _log.Warning("OpcUaPublish {Node}: cross-cluster binding — {Message}", ln, inconsistency))
: DeploymentArtifact.ParseComposition(artifact);
var plan = Phase7Planner.Compute(_lastApplied, composition);
if (plan.IsEmpty)
@@ -0,0 +1,187 @@
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.DependencyInjection;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Commons.Interfaces;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Admin;
using ZB.MOM.WW.OtOpcUa.Commons.Types;
using ZB.MOM.WW.OtOpcUa.Configuration;
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
namespace ZB.MOM.WW.OtOpcUa.Host.IntegrationTests;
/// <summary>
/// End-to-end multi-cluster scoping over the real 2-node Akka cluster: two driver nodes are bound
/// to DIFFERENT logical clusters (MAIN + SITE-A) via their <see cref="ClusterNode"/> rows, then a
/// SINGLE deployment is composed + broadcast through the real
/// <c>AdminOperationsActor → ConfigPublishCoordinator → DriverHostActor</c> path.
///
/// <para>This proves the full deploy path applies per-ClusterId scoping that the actor-level test
/// (<c>DriverHostActor_spawns_only_its_clusters_drivers</c>) covers in isolation: the node in MAIN
/// ends up hosting ONLY the MAIN driver, the node in SITE-A ONLY the SITE-A driver, and the
/// deployment still CONVERGES (seals) even though each node applies only a 1-driver slice — the ack
/// fires unconditionally regardless of slice size.</para>
///
/// <para>The harness wires <c>NullDriverFactory</c> (no real transports), so each spawned driver is
/// stubbed; we assert presence + identity (not connectivity) via
/// <see cref="IFleetDiagnosticsClient"/>, the same cross-node Ask path
/// <c>FleetDiagnosticsRoundTripTests</c> uses.</para>
/// </summary>
public sealed class MultiClusterScopingTests
{
private const string MainCluster = "MAIN";
private const string SiteACluster = "SITE-A";
private const string MainDriverId = "main-modbus";
private const string SiteADriverId = "sa-modbus";
private static CancellationToken Ct => TestContext.Current.CancellationToken;
/// <summary>
/// Verifies a single deploy scopes drivers per node: MAIN's node hosts only the MAIN driver,
/// SITE-A's node only the SITE-A driver, and both nodes reach Applied so the deployment seals.
/// </summary>
[Fact]
public async Task Deploy_scopes_drivers_to_each_nodes_own_cluster()
{
await using var harness = await TwoNodeClusterHarness.StartAsync();
// Assign each cluster node to a DIFFERENT logical cluster. NodeId MUST equal the node's
// ClusterRoleInfo.LocalNode (host:port) so the artifact's Nodes[] map resolves each node's
// ClusterId at apply time.
await SeedTwoClusterConfigAsync(harness, mainNodeId: harness.NodeANodeId, siteANodeId: harness.NodeBNodeId);
await using var scope = harness.NodeA.Services.CreateAsyncScope();
var adminOps = scope.ServiceProvider.GetRequiredService<IAdminOperationsClient>();
var result = await adminOps.StartDeploymentAsync(createdBy: "alice@test", Ct);
result.Outcome.ShouldBe(StartDeploymentOutcome.Accepted);
var deploymentId = result.DeploymentId!.Value.Value;
// Convergence: both nodes ack Applied and the coordinator seals — even though each node
// applied only a 1-driver slice of the 2-cluster artifact.
await WaitForAsync(async () =>
{
await using var db = await harness.CreateConfigDbContextAsync();
var d = await db.Deployments.AsNoTracking()
.FirstOrDefaultAsync(d => d.DeploymentId == deploymentId, Ct);
return d?.Status == DeploymentStatus.Sealed;
}, TimeSpan.FromSeconds(20));
await using (var db = await harness.CreateConfigDbContextAsync())
{
var nodeStates = await db.NodeDeploymentStates.AsNoTracking()
.Where(s => s.DeploymentId == deploymentId)
.ToListAsync(Ct);
nodeStates.Count.ShouldBe(2);
nodeStates.ShouldAllBe(s => s.Status == NodeDeploymentStatus.Applied);
}
// Per-node driver presence: each DriverHostActor spawned ONLY its own cluster's slice.
// Poll (rather than single-shot Ask) so transient timing after Sealed doesn't flake.
var diagnostics = scope.ServiceProvider.GetRequiredService<IFleetDiagnosticsClient>();
string[] mainDrivers = [];
await WaitForAsync(async () =>
{
mainDrivers = await GetDriverNamesAsync(diagnostics, harness.NodeANodeId);
return mainDrivers.SequenceEqual(new[] { MainDriverId });
}, TimeSpan.FromSeconds(10));
mainDrivers.ShouldBe(new[] { MainDriverId });
string[] siteADrivers = [];
await WaitForAsync(async () =>
{
siteADrivers = await GetDriverNamesAsync(diagnostics, harness.NodeBNodeId);
return siteADrivers.SequenceEqual(new[] { SiteADriverId });
}, TimeSpan.FromSeconds(10));
siteADrivers.ShouldBe(new[] { SiteADriverId });
}
/// <summary>Asks a node's DriverHostActor (over the cluster) for the names of its spawned drivers.</summary>
private static async Task<string[]> GetDriverNamesAsync(IFleetDiagnosticsClient diagnostics, string nodeId)
{
var snapshot = await diagnostics.GetDiagnosticsAsync(NodeId.Parse(nodeId), Ct);
return snapshot.Drivers.Select(d => d.Name).OrderBy(n => n, StringComparer.Ordinal).ToArray();
}
/// <summary>
/// Seeds two single-node clusters (MAIN, SITE-A), one <see cref="ClusterNode"/> per cluster bound
/// to the supplied <c>host:port</c> identities, and one <see cref="DriverInstance"/> per cluster.
/// <see cref="ConfigComposer.SnapshotAndFlattenAsync"/> emits these straight into the artifact's
/// <c>Clusters</c> / <c>Nodes</c> / <c>DriverInstances</c> arrays, giving each DriverHostActor the
/// multi-cluster artifact <c>DeploymentArtifact.ResolveClusterScope</c> filters by NodeId.
/// </summary>
private static async Task SeedTwoClusterConfigAsync(TwoNodeClusterHarness harness, string mainNodeId, string siteANodeId)
{
await using var db = await harness.CreateConfigDbContextAsync();
db.ServerClusters.AddRange(
NewCluster(MainCluster, "Main Cluster", "central"),
NewCluster(SiteACluster, "Site A Cluster", "site-a"));
db.Namespaces.AddRange(
NewNamespace(MainCluster, "MAIN-equipment", "urn:zb:central:equipment"),
NewNamespace(SiteACluster, "SITE-A-equipment", "urn:zb:site-a:equipment"));
db.ClusterNodes.AddRange(
NewNode(mainNodeId, MainCluster, "urn:zb:central:node-a"),
NewNode(siteANodeId, SiteACluster, "urn:zb:site-a:node-b"));
db.DriverInstances.AddRange(
NewDriver(MainDriverId, MainCluster, "MAIN-equipment"),
NewDriver(SiteADriverId, SiteACluster, "SITE-A-equipment"));
await db.SaveChangesAsync(Ct);
}
private static ServerCluster NewCluster(string clusterId, string name, string site) => new()
{
ClusterId = clusterId,
Name = name,
Enterprise = "zb",
Site = site,
NodeCount = 1,
RedundancyMode = RedundancyMode.None,
CreatedBy = "test",
};
private static Namespace NewNamespace(string clusterId, string namespaceId, string uri) => new()
{
NamespaceId = namespaceId,
ClusterId = clusterId,
Kind = NamespaceKind.Equipment,
NamespaceUri = uri,
};
private static ClusterNode NewNode(string nodeId, string clusterId, string applicationUri) => new()
{
NodeId = nodeId,
ClusterId = clusterId,
Host = TwoNodeClusterHarness.LoopbackHost,
ApplicationUri = applicationUri,
CreatedBy = "test",
};
private static DriverInstance NewDriver(string driverInstanceId, string clusterId, string namespaceId) => new()
{
DriverInstanceId = driverInstanceId,
ClusterId = clusterId,
NamespaceId = namespaceId,
Name = driverInstanceId,
DriverType = "ModbusTcp",
Enabled = true,
DriverConfig = "{}",
};
private static async Task WaitForAsync(Func<Task<bool>> condition, TimeSpan timeout)
{
var deadline = DateTime.UtcNow + timeout;
while (DateTime.UtcNow < deadline)
{
if (await condition()) return;
await Task.Delay(200);
}
throw new TimeoutException($"Condition not met within {timeout}");
}
}
@@ -90,6 +90,23 @@ public sealed class TwoNodeClusterHarness : IAsyncDisposable
/// <summary>Gets the Akka ActorSystem for node B.</summary>
public ActorSystem NodeBSystem => NodeB.Services.GetRequiredService<ActorSystem>();
/// <summary>
/// The <c>host:port</c> identity each node's <c>DriverHostActor</c> / <c>NodeDeploymentState</c>
/// row uses, derived the same way <c>ClusterRoleInfo</c> derives <c>LocalNode</c> from
/// <c>Cluster:PublicHostname</c> + <c>Cluster:Port</c>. Seed a <c>ClusterNode.NodeId</c> with this
/// value to bind a node to a logical ClusterId for multi-cluster scoping tests.
/// </summary>
public string NodeANodeId => $"{LoopbackHost}:{NodeAAkkaPort}";
/// <inheritdoc cref="NodeANodeId"/>
public string NodeBNodeId => $"{LoopbackHost}:{NodeBAkkaPort}";
/// <summary>Opens a new <see cref="OtOpcUaConfigDbContext"/> over the shared ConfigDb (the same
/// store both nodes read) so a test can seed clusters/nodes/drivers before triggering a deploy.</summary>
/// <returns>A new DbContext the caller is responsible for disposing.</returns>
public Task<OtOpcUaConfigDbContext> CreateConfigDbContextAsync()
=> NodeA.Services.GetRequiredService<IDbContextFactory<OtOpcUaConfigDbContext>>().CreateDbContextAsync();
/// <summary>Boots both nodes and waits up to <paramref name="formationTimeout"/> for cluster convergence.</summary>
/// <param name="formationTimeout">Maximum time to wait for cluster formation; defaults to 20 seconds if not provided.</param>
public static async Task<TwoNodeClusterHarness> StartAsync(TimeSpan? formationTimeout = null)
@@ -1,3 +1,4 @@
using System.Linq;
using System.Text;
using System.Text.Json;
using Shouldly;
@@ -8,6 +9,78 @@ namespace ZB.MOM.WW.OtOpcUa.Runtime.Tests.Drivers;
public sealed class DeploymentArtifactTests
{
private static byte[] BlobOf(object snapshot) =>
System.Text.Json.JsonSerializer.SerializeToUtf8Bytes(snapshot);
private static object MultiClusterSnapshot() => new
{
Clusters = new[] { new { ClusterId = "MAIN" }, new { ClusterId = "SITE-A" } },
Nodes = new[]
{
new { NodeId = "central-1:4053", ClusterId = "MAIN" },
new { NodeId = "site-a-1:4053", ClusterId = "SITE-A" },
},
DriverInstances = new[]
{
new { DriverInstanceRowId = Guid.NewGuid(), DriverInstanceId = "main-galaxy", Name = "g", DriverType = "GalaxyMxGateway", Enabled = true, DriverConfig = "{}", ClusterId = "MAIN", NamespaceId = "main-ns" },
new { DriverInstanceRowId = Guid.NewGuid(), DriverInstanceId = "sa-modbus", Name = "m", DriverType = "Modbus", Enabled = true, DriverConfig = "{}", ClusterId = "SITE-A", NamespaceId = "sa-ns" },
},
};
/// <summary>Verifies a single-cluster artifact resolves to None (apply everything).</summary>
[Fact]
public void ResolveClusterScope_single_cluster_artifact_returns_None()
{
var blob = BlobOf(new { Clusters = new[] { new { ClusterId = "MAIN" } }, Nodes = Array.Empty<object>() });
var scope = DeploymentArtifact.ResolveClusterScope(blob, "central-1:4053");
scope.Mode.ShouldBe(ClusterFilterMode.None);
}
/// <summary>Verifies a multi-cluster artifact scopes a known node to its own ClusterId.</summary>
[Fact]
public void ResolveClusterScope_multi_cluster_known_node_scopes_to_its_cluster()
{
var scope = DeploymentArtifact.ResolveClusterScope(BlobOf(MultiClusterSnapshot()), "site-a-1:4053");
scope.Mode.ShouldBe(ClusterFilterMode.ScopeTo);
scope.ClusterId.ShouldBe("SITE-A");
}
/// <summary>Verifies a multi-cluster artifact suppresses an unknown node.</summary>
[Fact]
public void ResolveClusterScope_multi_cluster_unknown_node_suppresses()
{
var scope = DeploymentArtifact.ResolveClusterScope(BlobOf(MultiClusterSnapshot()), "ghost-9:4053");
scope.Mode.ShouldBe(ClusterFilterMode.Suppress);
}
/// <summary>Verifies the scoped parse returns only the node's own cluster's drivers.</summary>
[Fact]
public void ParseDriverInstances_scoped_returns_only_my_clusters_drivers()
{
var specs = DeploymentArtifact.ParseDriverInstances(BlobOf(MultiClusterSnapshot()), "central-1:4053");
specs.Select(s => s.DriverInstanceId).ShouldBe(new[] { "main-galaxy" });
}
/// <summary>Verifies the scoped parse returns nothing for an unknown node.</summary>
[Fact]
public void ParseDriverInstances_scoped_unknown_node_returns_empty()
{
var specs = DeploymentArtifact.ParseDriverInstances(BlobOf(MultiClusterSnapshot()), "ghost-9:4053");
specs.ShouldBeEmpty();
}
/// <summary>Verifies the scoped parse returns all drivers for a single-cluster artifact.</summary>
[Fact]
public void ParseDriverInstances_scoped_single_cluster_returns_all()
{
var blob = BlobOf(new
{
Clusters = new[] { new { ClusterId = "MAIN" } },
Nodes = new[] { new { NodeId = "n1:4053", ClusterId = "MAIN" } },
DriverInstances = new[] { new { DriverInstanceRowId = Guid.NewGuid(), DriverInstanceId = "d1", Name = "d", DriverType = "Modbus", Enabled = true, DriverConfig = "{}", ClusterId = "MAIN" } },
});
DeploymentArtifact.ParseDriverInstances(blob, "anything:4053").Select(s => s.DriverInstanceId).ShouldBe(new[] { "d1" });
}
/// <summary>Verifies that empty blob returns empty list.</summary>
[Fact]
public void Empty_blob_returns_empty_list()
@@ -222,4 +295,145 @@ public sealed class DeploymentArtifactTests
specs.Single().DriverInstanceId.ShouldBe("DI-ok");
}
/// <summary>Verifies that a malformed blob resolves to None rather than throwing.</summary>
[Fact]
public void ResolveClusterScope_malformed_blob_returns_None()
{
var scope = DeploymentArtifact.ResolveClusterScope("not json"u8.ToArray(), "central-1:4053");
scope.Mode.ShouldBe(ClusterFilterMode.None);
}
/// <summary>Verifies that a blank ClusterId in the node row resolves to Suppress.</summary>
[Fact]
public void ResolveClusterScope_blank_cluster_id_suppresses()
{
var blob = BlobOf(new
{
Clusters = new[] { new { ClusterId = "MAIN" }, new { ClusterId = "SITE-A" } },
Nodes = new[] { new { NodeId = "central-1:4053", ClusterId = "" } },
});
DeploymentArtifact.ResolveClusterScope(blob, "central-1:4053").Mode.ShouldBe(ClusterFilterMode.Suppress);
}
/// <summary>Verifies that NodeId matching in ResolveClusterScope is case-insensitive.</summary>
[Fact]
public void ResolveClusterScope_node_id_match_is_case_insensitive()
{
var blob = BlobOf(new
{
Clusters = new[] { new { ClusterId = "MAIN" }, new { ClusterId = "SITE-A" } },
Nodes = new[] { new { NodeId = "Central-1:4053", ClusterId = "MAIN" } },
});
var scope = DeploymentArtifact.ResolveClusterScope(blob, "central-1:4053");
scope.Mode.ShouldBe(ClusterFilterMode.ScopeTo);
scope.ClusterId.ShouldBe("MAIN");
}
private static object MultiClusterSnapshotWithTags() => new
{
Clusters = new[] { new { ClusterId = "MAIN" }, new { ClusterId = "SITE-A" } },
Nodes = new[]
{
new { NodeId = "central-1:4053", ClusterId = "MAIN" },
new { NodeId = "site-a-1:4053", ClusterId = "SITE-A" },
},
DriverInstances = new[]
{
new { DriverInstanceId = "main-galaxy", DriverType = "GalaxyMxGateway", DriverConfig = "{}", ClusterId = "MAIN", NamespaceId = "main-ns" },
new { DriverInstanceId = "sa-galaxy", DriverType = "GalaxyMxGateway", DriverConfig = "{}", ClusterId = "SITE-A", NamespaceId = "sa-ns" },
},
Namespaces = new[]
{
new { NamespaceId = "main-ns", ClusterId = "MAIN", Kind = 1 },
new { NamespaceId = "sa-ns", ClusterId = "SITE-A", Kind = 1 },
},
Tags = new[]
{
new { TagId = "t-main", DriverInstanceId = "main-galaxy", EquipmentId = (string?)null, Name = "M1", FolderPath = "F", DataType = "Boolean", TagConfig = "{}" },
new { TagId = "t-sa", DriverInstanceId = "sa-galaxy", EquipmentId = (string?)null, Name = "S1", FolderPath = "F", DataType = "Boolean", TagConfig = "{}" },
},
};
[Fact]
public void ParseComposition_scoped_keeps_only_my_clusters_drivers_and_tags()
{
var blob = BlobOf(MultiClusterSnapshotWithTags());
var main = DeploymentArtifact.ParseComposition(blob, "central-1:4053");
main.DriverInstancePlans.Select(d => d.DriverInstanceId).ShouldBe(new[] { "main-galaxy" });
main.GalaxyTags.Select(t => t.TagId).ShouldBe(new[] { "t-main" });
var siteA = DeploymentArtifact.ParseComposition(blob, "site-a-1:4053");
siteA.DriverInstancePlans.Select(d => d.DriverInstanceId).ShouldBe(new[] { "sa-galaxy" });
siteA.GalaxyTags.Select(t => t.TagId).ShouldBe(new[] { "t-sa" });
}
[Fact]
public void ParseComposition_scoped_unknown_node_is_empty()
{
var comp = DeploymentArtifact.ParseComposition(BlobOf(MultiClusterSnapshotWithTags()), "ghost-9:4053");
comp.GalaxyTags.ShouldBeEmpty();
comp.DriverInstancePlans.ShouldBeEmpty();
}
[Fact]
public void ParseComposition_single_cluster_node_id_overload_matches_legacy()
{
var blob = BlobOf(new
{
Clusters = new[] { new { ClusterId = "MAIN" } },
Nodes = new[] { new { NodeId = "n1:4053", ClusterId = "MAIN" } },
DriverInstances = new[] { new { DriverInstanceId = "d1", DriverType = "Modbus", DriverConfig = "{}", ClusterId = "MAIN", NamespaceId = "ns" } },
});
DeploymentArtifact.ParseComposition(blob, "anything:4053").DriverInstancePlans.Count
.ShouldBe(DeploymentArtifact.ParseComposition(blob).DriverInstancePlans.Count);
}
/// <summary>An artifact where an equipment's driver is in the node's cluster but its UNS line's area
/// is in another cluster: <paramref name="areaCluster"/> controls the area's ClusterId.</summary>
private static byte[] OrphanEquipmentBlob(string areaCluster) => BlobOf(new
{
Clusters = new[] { new { ClusterId = "MAIN" }, new { ClusterId = "SITE-A" } },
Nodes = new[] { new { NodeId = "central-1:4053", ClusterId = "MAIN" } },
DriverInstances = new[]
{
new { DriverInstanceId = "main-driver", DriverType = "Modbus", DriverConfig = "{}", ClusterId = "MAIN", NamespaceId = "main-ns" },
},
UnsAreas = new[] { new { UnsAreaId = "area-1", Name = "Area1", ClusterId = areaCluster } },
UnsLines = new[] { new { UnsLineId = "line-1", UnsAreaId = "area-1", Name = "Line1" } },
Equipment = new[]
{
new { EquipmentId = "equip-1", Name = "Equip1", UnsLineId = "line-1", DriverInstanceId = "main-driver" },
},
});
/// <summary>Verifies the inconsistency callback fires when a kept equipment's UNS line belongs to
/// another cluster (a cross-cluster orphan binding).</summary>
[Fact]
public void ParseComposition_scoped_flags_cross_cluster_orphan_equipment()
{
var warnings = new List<string>();
var comp = DeploymentArtifact.ParseComposition(
OrphanEquipmentBlob(areaCluster: "SITE-A"), "central-1:4053", warnings.Add);
comp.EquipmentNodes.Select(e => e.EquipmentId).ShouldBe(new[] { "equip-1" }); // still returned
warnings.Count.ShouldBe(1);
warnings[0].ShouldContain("equip-1");
warnings[0].ShouldContain("line-1");
}
/// <summary>Verifies the inconsistency callback does NOT fire when the equipment, its driver, and its
/// UNS line/area are all in the same cluster (the normal, invariant-respecting case).</summary>
[Fact]
public void ParseComposition_scoped_consistent_equipment_does_not_warn()
{
var warnings = new List<string>();
var comp = DeploymentArtifact.ParseComposition(
OrphanEquipmentBlob(areaCluster: "MAIN"), "central-1:4053", warnings.Add);
comp.EquipmentNodes.Select(e => e.EquipmentId).ShouldBe(new[] { "equip-1" });
comp.UnsLines.Select(l => l.UnsLineId).ShouldBe(new[] { "line-1" });
warnings.ShouldBeEmpty();
}
}
@@ -132,6 +132,86 @@ public sealed class DriverHostActorReconcileTests : RuntimeActorTestBase
snap.Drivers.Count.ShouldBe(1);
}
/// <summary>
/// Verifies per-ClusterId scoping at the actor level: a 2-cluster artifact (MAIN + SITE-A,
/// one driver each) dispatched to a node whose ClusterNode row puts it in SITE-A spawns ONLY
/// the SITE-A driver — and the node still reaches Applied (the ack fires unconditionally even
/// when a node's cluster slice is empty).
/// </summary>
[Fact]
public void DriverHostActor_spawns_only_its_clusters_drivers()
{
var db = NewInMemoryDbFactory();
var factory = new CountingDriverFactory("Modbus");
// Both drivers are Modbus so the factory could create either — scoping, not type support,
// must be what excludes the MAIN driver.
var deploymentId = SeedMultiClusterDeployment(db, RevA,
("main-modbus", "Modbus", "{}", true, "MAIN"),
("sa-modbus", "Modbus", "{}", true, "SITE-A"));
// This node belongs to SITE-A per the Nodes (ClusterNode) rows.
var siteANode = NodeId.Parse("site-a-1:4053");
var coordinator = CreateTestProbe();
var actor = Sys.ActorOf(DriverHostActor.Props(
db, siteANode, coordinator.Ref,
driverFactory: factory,
localRoles: new HashSet<string> { "driver" }));
actor.Tell(new DispatchDeployment(deploymentId, RevA, CorrelationId.NewId()));
// The node still reaches Applied even though it hosts only its own cluster's slice.
coordinator.ExpectMsg<ApplyAck>(TimeSpan.FromSeconds(5)).Outcome.ShouldBe(ApplyAckOutcome.Applied);
// Only the SITE-A driver was constructed — the MAIN driver was filtered out by scoping.
AwaitAssert(() => factory.CreateCount.ShouldBe(1), duration: TimeSpan.FromSeconds(3));
actor.Tell(new GetDiagnostics(CorrelationId.NewId()), coordinator.Ref);
var snap = coordinator.ExpectMsg<Commons.Interfaces.NodeDiagnosticsSnapshot>(TimeSpan.FromSeconds(2));
snap.Drivers.Count.ShouldBe(1);
snap.Drivers[0].Name.ShouldBe("sa-modbus");
}
private static DeploymentId SeedMultiClusterDeployment(
IDbContextFactory<OtOpcUaConfigDbContext> db,
RevisionHash rev,
params (string Id, string Type, string Config, bool Enabled, string ClusterId)[] drivers)
{
var artifact = JsonSerializer.SerializeToUtf8Bytes(new
{
// >1 cluster + matching Nodes rows triggers ScopeTo (single-cluster would resolve to None).
Clusters = new[] { new { ClusterId = "MAIN" }, new { ClusterId = "SITE-A" } },
Nodes = new[]
{
new { NodeId = "central-1:4053", ClusterId = "MAIN" },
new { NodeId = "site-a-1:4053", ClusterId = "SITE-A" },
},
DriverInstances = drivers.Select(d => new
{
DriverInstanceRowId = Guid.NewGuid(),
DriverInstanceId = d.Id,
Name = d.Id,
DriverType = d.Type,
Enabled = d.Enabled,
DriverConfig = d.Config,
ClusterId = d.ClusterId,
}).ToArray(),
});
var id = DeploymentId.NewId();
using var ctx = db.CreateDbContext();
ctx.Deployments.Add(new Deployment
{
DeploymentId = id.Value,
RevisionHash = rev.Value,
Status = DeploymentStatus.Sealed,
CreatedBy = "test",
SealedAtUtc = DateTime.UtcNow,
ArtifactBlob = artifact,
});
ctx.SaveChanges();
return id;
}
private static DeploymentId SeedDeploymentWithDrivers(
IDbContextFactory<OtOpcUaConfigDbContext> db,
RevisionHash rev,
@@ -98,6 +98,104 @@ public sealed class OpcUaPublishActorRebuildTests : RuntimeActorTestBase
AwaitAssert(() => sink.RebuildCalls.ShouldBe(1), duration: TimeSpan.FromMilliseconds(500));
}
/// <summary>
/// Wiring proof for per-ClusterId scoping (Task 4): a multi-cluster artifact must
/// materialise ONLY the local node's cluster slice. Mirrors the multi-cluster artifact
/// shape exercised in <c>DeploymentArtifactTests</c> (MAIN + SITE-A, one Galaxy driver +
/// one SystemPlatform tag each). The scoped rebuild for the SITE-A node must surface the
/// SITE-A tag (<c>t-sa</c> → variable <c>F.S1</c>) and NOT MAIN's (<c>t-main</c> →
/// <c>F.M1</c>); the mirror holds for the MAIN node. Without the production scoping edit,
/// the unscoped parse would materialise BOTH variables on every node.
/// </summary>
[Fact]
public void Rebuild_materialises_only_the_nodes_cluster()
{
// --- SITE-A node: only the SITE-A tag's variable, never MAIN's. ---
var dbA = NewInMemoryDbFactory();
var sinkA = new RecordingSink();
var applierA = new Phase7Applier(sinkA, NullLogger<Phase7Applier>.Instance);
SeedMultiClusterDeployment(dbA);
var siteActor = Sys.ActorOf(OpcUaPublishActor.PropsForTests(
sink: sinkA,
dbFactory: dbA,
applier: applierA,
localNode: NodeId.Parse("site-a-1:4053")));
siteActor.Tell(new OpcUaPublishActor.RebuildAddressSpace(CorrelationId.NewId()));
AwaitAssert(() => sinkA.RebuildCalls.ShouldBe(1), duration: TimeSpan.FromSeconds(2));
// t-sa (Name "S1", FolderPath "F") → MxAccessRef "F.S1" → variable node "F.S1".
sinkA.Calls.ShouldContain("EV:F.S1");
// t-main (MAIN cluster) must NOT leak onto the SITE-A node.
sinkA.Calls.ShouldNotContain("EV:F.M1");
// --- MAIN node: the mirror — only MAIN's tag's variable, never SITE-A's. ---
var dbM = NewInMemoryDbFactory();
var sinkM = new RecordingSink();
var applierM = new Phase7Applier(sinkM, NullLogger<Phase7Applier>.Instance);
SeedMultiClusterDeployment(dbM);
var mainActor = Sys.ActorOf(OpcUaPublishActor.PropsForTests(
sink: sinkM,
dbFactory: dbM,
applier: applierM,
localNode: NodeId.Parse("central-1:4053")));
mainActor.Tell(new OpcUaPublishActor.RebuildAddressSpace(CorrelationId.NewId()));
AwaitAssert(() => sinkM.RebuildCalls.ShouldBe(1), duration: TimeSpan.FromSeconds(2));
sinkM.Calls.ShouldContain("EV:F.M1");
sinkM.Calls.ShouldNotContain("EV:F.S1");
}
/// <summary>
/// Seal a 2-cluster deployment (MAIN + SITE-A) whose artifact mirrors the multi-cluster
/// shape the composer emits: a <c>Clusters</c> + <c>Nodes</c> map, one SystemPlatform
/// namespace + Galaxy driver + Galaxy tag per cluster. Used by
/// <see cref="Rebuild_materialises_only_the_nodes_cluster"/>.
/// </summary>
private static void SeedMultiClusterDeployment(IDbContextFactory<OtOpcUaConfigDbContext> dbFactory)
{
var artifact = JsonSerializer.SerializeToUtf8Bytes(new
{
Clusters = new[] { new { ClusterId = "MAIN" }, new { ClusterId = "SITE-A" } },
Nodes = new[]
{
new { NodeId = "central-1:4053", ClusterId = "MAIN" },
new { NodeId = "site-a-1:4053", ClusterId = "SITE-A" },
},
DriverInstances = new[]
{
new { DriverInstanceId = "main-galaxy", DriverType = "GalaxyMxGateway", DriverConfig = "{}", ClusterId = "MAIN", NamespaceId = "main-ns" },
new { DriverInstanceId = "sa-galaxy", DriverType = "GalaxyMxGateway", DriverConfig = "{}", ClusterId = "SITE-A", NamespaceId = "sa-ns" },
},
Namespaces = new[]
{
new { NamespaceId = "main-ns", ClusterId = "MAIN", Kind = 1 }, // NamespaceKind.SystemPlatform
new { NamespaceId = "sa-ns", ClusterId = "SITE-A", Kind = 1 },
},
Tags = new[]
{
new { TagId = "t-main", DriverInstanceId = "main-galaxy", EquipmentId = (string?)null, Name = "M1", FolderPath = "F", DataType = "Boolean", TagConfig = "{}" },
new { TagId = "t-sa", DriverInstanceId = "sa-galaxy", EquipmentId = (string?)null, Name = "S1", FolderPath = "F", DataType = "Boolean", TagConfig = "{}" },
},
ScriptedAlarms = Array.Empty<object>(),
});
using var ctx = dbFactory.CreateDbContext();
ctx.Deployments.Add(new Deployment
{
DeploymentId = Guid.NewGuid(),
RevisionHash = new string('b', 64),
Status = DeploymentStatus.Sealed,
CreatedBy = "test",
SealedAtUtc = DateTime.UtcNow,
ArtifactBlob = artifact,
});
ctx.SaveChanges();
}
private static void SeedDeployment(
IDbContextFactory<OtOpcUaConfigDbContext> dbFactory,
string[] equipmentIds,