From 7bec2fd4db5fe8caf8539d5bcf18f86d9163bcd0 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sun, 7 Jun 2026 02:59:46 -0400 Subject: [PATCH] docs(plan): per-ClusterId scoping implementation plan + task graph 10 tasks: runtime scoping (ResolveClusterScope + scoped ParseDriverInstances/ ParseComposition, DriverHostActor + OpcUaPublishActor wiring, multi-cluster E2E) then docker-dev compose/traefik/seed rewrite, live verification, docs. --- docs/plans/2026-06-07-per-cluster-scoping.md | 825 ++++++++++++++++++ ...26-06-07-per-cluster-scoping.md.tasks.json | 16 + 2 files changed, 841 insertions(+) create mode 100644 docs/plans/2026-06-07-per-cluster-scoping.md create mode 100644 docs/plans/2026-06-07-per-cluster-scoping.md.tasks.json diff --git a/docs/plans/2026-06-07-per-cluster-scoping.md b/docs/plans/2026-06-07-per-cluster-scoping.md new file mode 100644 index 00000000..31cc8fe4 --- /dev/null +++ b/docs/plans/2026-06-07-per-cluster-scoping.md @@ -0,0 +1,825 @@ +# Per-ClusterId Scoping (hub-and-spoke single mesh) Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:executing-plans to implement this plan task-by-task. + +**Goal:** Let one central cluster's Admin UI deploy to multiple logically-separate +clusters that share one Akka mesh, with each node applying only its own +`ClusterId`'s drivers + OPC UA address space. + +**Architecture:** Approach A — node-side, parse-time filtering. Each node resolves +its own `ClusterId` from the deployment artifact's `ClusterNode` rows (no extra DB +query) and filters both the driver specs and the address-space composition to that +cluster. The coordinator stays a single broadcast; every node applies its own +slice and acks. A single-cluster artifact filters to nothing-different, so existing +deployments + tests are unaffected. + +**Tech Stack:** .NET 10, Akka.NET, EF Core, `System.Text.Json` (artifact parse), +xUnit v2 + Shouldly (Runtime.Tests uses Akka.TestKit.Xunit2), Docker Compose + Traefik. + +**Design doc:** `docs/plans/2026-06-07-per-cluster-scoping-design.md` (approved). + +**The fallback rule (single source of truth — implemented once in `ResolveClusterScope`):** +- artifact has **≤1 cluster** → `None` (apply everything; legacy/single-cluster + all existing tests behave identically). +- artifact has **>1 cluster** and the node's `ClusterNode` row is found → `ScopeTo(clusterId)`. +- artifact has **>1 cluster** and the node's row is **not** found → `Suppress` (apply nothing + the caller logs). + +**Hard rules (carry through every task):** never `git add .` — stage by explicit +path; never stage `sql_login.txt` or `src/Server/.../pki/`; never echo the gateway +API key into a *new* tracked file; never force-push or skip hooks. + +--- + +## Task 1: `ResolveClusterScope` + node-scoped `ParseDriverInstances` + +**Classification:** standard +**Estimated implement time:** ~5 min +**Parallelizable with:** Task 6, Task 7, Task 8 + +**Files:** +- Modify: `src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DeploymentArtifact.cs` +- Test: `tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/DeploymentArtifactTests.cs` + +**Context:** `DeploymentArtifact` is a static JSON decoder over the artifact blob +produced by `ConfigComposer.SnapshotAndFlattenAsync`. The artifact root has +Pascal-case arrays: `Clusters` (ServerCluster, has `ClusterId`), `Nodes` +(ClusterNode, has `NodeId` + `ClusterId`), `DriverInstances` (has `ClusterId`), +`Namespaces`/`UnsAreas` (have `ClusterId`), `Equipment`/`Tags`/`UnsLines`/`ScriptedAlarms` +(no `ClusterId` — traced via `DriverInstanceId`/`UnsAreaId`/`EquipmentId`). +`DriverInstanceSpec` already carries `ClusterId` (`DeploymentArtifact.cs:19`). + +**Step 1: Write the failing tests** + +Add to `DeploymentArtifactTests.cs`. Reuse the file's existing artifact-blob +helper if present; otherwise add this minimal builder: + +```csharp +private static byte[] BlobOf(object snapshot) => + System.Text.Json.JsonSerializer.SerializeToUtf8Bytes(snapshot); + +private static object MultiClusterSnapshot() => new +{ + Clusters = new[] { new { ClusterId = "MAIN" }, new { ClusterId = "SITE-A" } }, + Nodes = new[] + { + new { NodeId = "central-1:4053", ClusterId = "MAIN" }, + new { NodeId = "site-a-1:4053", ClusterId = "SITE-A" }, + }, + DriverInstances = new[] + { + new { DriverInstanceRowId = Guid.NewGuid(), DriverInstanceId = "main-galaxy", Name = "g", DriverType = "GalaxyMxGateway", Enabled = true, DriverConfig = "{}", ClusterId = "MAIN", NamespaceId = "main-ns" }, + new { DriverInstanceRowId = Guid.NewGuid(), DriverInstanceId = "sa-modbus", Name = "m", DriverType = "Modbus", Enabled = true, DriverConfig = "{}", ClusterId = "SITE-A", NamespaceId = "sa-ns" }, + }, +}; +``` + +```csharp +[Fact] +public void ResolveClusterScope_single_cluster_artifact_returns_None() +{ + var blob = BlobOf(new { Clusters = new[] { new { ClusterId = "MAIN" } }, Nodes = Array.Empty() }); + var scope = DeploymentArtifact.ResolveClusterScope(blob, "central-1:4053"); + scope.Mode.ShouldBe(ClusterFilterMode.None); +} + +[Fact] +public void ResolveClusterScope_multi_cluster_known_node_scopes_to_its_cluster() +{ + var scope = DeploymentArtifact.ResolveClusterScope(BlobOf(MultiClusterSnapshot()), "site-a-1:4053"); + scope.Mode.ShouldBe(ClusterFilterMode.ScopeTo); + scope.ClusterId.ShouldBe("SITE-A"); +} + +[Fact] +public void ResolveClusterScope_multi_cluster_unknown_node_suppresses() +{ + var scope = DeploymentArtifact.ResolveClusterScope(BlobOf(MultiClusterSnapshot()), "ghost-9:4053"); + scope.Mode.ShouldBe(ClusterFilterMode.Suppress); +} + +[Fact] +public void ParseDriverInstances_scoped_returns_only_my_clusters_drivers() +{ + var specs = DeploymentArtifact.ParseDriverInstances(BlobOf(MultiClusterSnapshot()), "central-1:4053"); + specs.Select(s => s.DriverInstanceId).ShouldBe(new[] { "main-galaxy" }); +} + +[Fact] +public void ParseDriverInstances_scoped_unknown_node_returns_empty() +{ + var specs = DeploymentArtifact.ParseDriverInstances(BlobOf(MultiClusterSnapshot()), "ghost-9:4053"); + specs.ShouldBeEmpty(); +} + +[Fact] +public void ParseDriverInstances_scoped_single_cluster_returns_all() +{ + var blob = BlobOf(new + { + Clusters = new[] { new { ClusterId = "MAIN" } }, + Nodes = new[] { new { NodeId = "n1:4053", ClusterId = "MAIN" } }, + DriverInstances = new[] { new { DriverInstanceRowId = Guid.NewGuid(), DriverInstanceId = "d1", Name = "d", DriverType = "Modbus", Enabled = true, DriverConfig = "{}", ClusterId = "MAIN" } }, + }); + DeploymentArtifact.ParseDriverInstances(blob, "anything:4053").Select(s => s.DriverInstanceId).ShouldBe(new[] { "d1" }); +} +``` + +**Step 2: Run the tests — verify they fail** + +Run: `dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests --filter "FullyQualifiedName~DeploymentArtifactTests"` +Expected: FAIL — `ClusterFilterMode` / `ResolveClusterScope` / the 2-arg `ParseDriverInstances` don't exist. + +**Step 3: Implement** + +In `DeploymentArtifact.cs`, add the scope types just above `public static class DeploymentArtifact` (top-level, same namespace): + +```csharp +/// How a node should scope a deployment artifact to its own ClusterId. +public enum ClusterFilterMode { None, ScopeTo, Suppress } + +/// Resolved scoping decision for a node against an artifact. +/// None = apply everything (single-cluster / legacy); ScopeTo = filter to ; Suppress = apply nothing. +/// The node's ClusterId when is ScopeTo; otherwise null. +public readonly record struct ClusterScope(ClusterFilterMode Mode, string? ClusterId); +``` + +Inside the class, add `ResolveClusterScope` and the 2-arg `ParseDriverInstances` +overload (place after the existing `ParseDriverInstances`): + +```csharp +/// +/// Resolve how a node should scope a multi-cluster deployment artifact to its own logical +/// cluster, from the same consistent snapshot it applies (the artifact's ClusterNode rows map +/// NodeId → ClusterId; the ServerCluster count decides single- vs multi-cluster). Fallback rule: +/// ≤1 cluster ⇒ no filter (legacy single-cluster meshes + existing tests unchanged); >1 cluster +/// with the node's row found ⇒ scope to that ClusterId; >1 cluster with the row missing ⇒ +/// suppress (apply nothing) — a node in a multi-cluster mesh with no ClusterNode row is +/// misconfigured and must not serve other clusters' data. +/// +/// The deployment artifact blob. +/// This node's identity in "host:port" form (matches ClusterNode.NodeId). +/// The scoping decision for this node. +public static ClusterScope ResolveClusterScope(ReadOnlySpan blob, string nodeId) +{ + if (blob.IsEmpty) return new ClusterScope(ClusterFilterMode.None, null); + try + { + using var doc = JsonDocument.Parse(blob.ToArray()); + var root = doc.RootElement; + var clusterCount = root.TryGetProperty("Clusters", out var cl) && cl.ValueKind == JsonValueKind.Array + ? cl.GetArrayLength() : 0; + if (clusterCount <= 1) return new ClusterScope(ClusterFilterMode.None, null); + + string? myCluster = null; + if (root.TryGetProperty("Nodes", out var nodes) && nodes.ValueKind == JsonValueKind.Array) + { + foreach (var el in nodes.EnumerateArray()) + { + if (el.ValueKind != JsonValueKind.Object) continue; + var nid = el.TryGetProperty("NodeId", out var nEl) ? nEl.GetString() : null; + if (!string.Equals(nid, nodeId, StringComparison.Ordinal)) continue; + myCluster = el.TryGetProperty("ClusterId", out var cEl) ? cEl.GetString() : null; + break; + } + } + return string.IsNullOrWhiteSpace(myCluster) + ? new ClusterScope(ClusterFilterMode.Suppress, null) + : new ClusterScope(ClusterFilterMode.ScopeTo, myCluster); + } + catch (JsonException) + { + return new ClusterScope(ClusterFilterMode.None, null); + } +} + +/// Cluster-scoped overload: the driver specs a node should host given its NodeId. +/// The deployment artifact blob. +/// This node's identity in "host:port" form. +/// The filtered driver specs per the node's decision. +public static IReadOnlyList ParseDriverInstances(ReadOnlySpan blob, string nodeId) +{ + var scope = ResolveClusterScope(blob, nodeId); + var all = ParseDriverInstances(blob); + return scope.Mode switch + { + ClusterFilterMode.Suppress => Array.Empty(), + ClusterFilterMode.ScopeTo => all.Where( + s => string.Equals(s.ClusterId, scope.ClusterId, StringComparison.Ordinal)).ToArray(), + _ => all, + }; +} +``` + +**Step 4: Run the tests — verify they pass** + +Run: `dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests --filter "FullyQualifiedName~DeploymentArtifactTests"` +Expected: PASS (new + all pre-existing DeploymentArtifact tests). + +**Step 5: Commit** + +```bash +git add src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DeploymentArtifact.cs \ + tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/DeploymentArtifactTests.cs +git commit -m "feat(runtime): ClusterId scope resolution + node-scoped driver-spec parse" +``` + +**Acceptance:** `ResolveClusterScope` implements the 3-branch rule; the scoped +`ParseDriverInstances` filters per the rule; the no-arg overload is untouched. + +--- + +## Task 2: Node-scoped `ParseComposition` (address-space filter) + +**Classification:** standard +**Estimated implement time:** ~5 min +**Parallelizable with:** Task 6, Task 7, Task 8 + +**Blocked by:** Task 1 (uses `ClusterScope` / `ResolveClusterScope`, same file). + +**Files:** +- Modify: `src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DeploymentArtifact.cs` +- Test: `tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/DeploymentArtifactTests.cs` + +**Context:** `ParseComposition(blob)` returns a `Phase7CompositionResult` whose +projections carry no `ClusterId`. Filter by building in-cluster id sets from the +raw artifact: `DriverInstanceId`s and `UnsAreaId`s whose row's `ClusterId` matches, +plus `EquipmentId`s whose `DriverInstanceId` is in-cluster. Then filter each +projection (areas by `UnsAreaId`, lines by `UnsAreaId`, equipment by `EquipmentId`, +drivers/galaxyTags/equipmentTags by `DriverInstanceId`, alarms by `EquipmentId`). + +**Step 1: Write the failing tests** + +Extend `MultiClusterSnapshot()` from Task 1 with namespaces + tags so galaxy-tag +filtering is exercised, then add the test: + +```csharp +private static object MultiClusterSnapshotWithTags() => new +{ + Clusters = new[] { new { ClusterId = "MAIN" }, new { ClusterId = "SITE-A" } }, + Nodes = new[] + { + new { NodeId = "central-1:4053", ClusterId = "MAIN" }, + new { NodeId = "site-a-1:4053", ClusterId = "SITE-A" }, + }, + DriverInstances = new[] + { + new { DriverInstanceId = "main-galaxy", DriverType = "GalaxyMxGateway", DriverConfig = "{}", ClusterId = "MAIN", NamespaceId = "main-ns" }, + new { DriverInstanceId = "sa-galaxy", DriverType = "GalaxyMxGateway", DriverConfig = "{}", ClusterId = "SITE-A", NamespaceId = "sa-ns" }, + }, + Namespaces = new[] + { + new { NamespaceId = "main-ns", ClusterId = "MAIN", Kind = 1 }, + new { NamespaceId = "sa-ns", ClusterId = "SITE-A", Kind = 1 }, + }, + Tags = new[] + { + new { TagId = "t-main", DriverInstanceId = "main-galaxy", EquipmentId = (string?)null, Name = "M1", FolderPath = "F", DataType = "Boolean", TagConfig = "{}" }, + new { TagId = "t-sa", DriverInstanceId = "sa-galaxy", EquipmentId = (string?)null, Name = "S1", FolderPath = "F", DataType = "Boolean", TagConfig = "{}" }, + }, +}; + +[Fact] +public void ParseComposition_scoped_keeps_only_my_clusters_drivers_and_tags() +{ + var blob = BlobOf(MultiClusterSnapshotWithTags()); + + var main = DeploymentArtifact.ParseComposition(blob, "central-1:4053"); + main.DriverInstancePlans.Select(d => d.DriverInstanceId).ShouldBe(new[] { "main-galaxy" }); + main.GalaxyTags.Select(t => t.TagId).ShouldBe(new[] { "t-main" }); + + var siteA = DeploymentArtifact.ParseComposition(blob, "site-a-1:4053"); + siteA.DriverInstancePlans.Select(d => d.DriverInstanceId).ShouldBe(new[] { "sa-galaxy" }); + siteA.GalaxyTags.Select(t => t.TagId).ShouldBe(new[] { "t-sa" }); +} + +[Fact] +public void ParseComposition_scoped_unknown_node_is_empty() +{ + var comp = DeploymentArtifact.ParseComposition(BlobOf(MultiClusterSnapshotWithTags()), "ghost-9:4053"); + comp.GalaxyTags.ShouldBeEmpty(); + comp.DriverInstancePlans.ShouldBeEmpty(); +} + +[Fact] +public void ParseComposition_single_cluster_node_id_overload_matches_legacy() +{ + var blob = BlobOf(new + { + Clusters = new[] { new { ClusterId = "MAIN" } }, + Nodes = new[] { new { NodeId = "n1:4053", ClusterId = "MAIN" } }, + DriverInstances = new[] { new { DriverInstanceId = "d1", DriverType = "Modbus", DriverConfig = "{}", ClusterId = "MAIN", NamespaceId = "ns" } }, + }); + DeploymentArtifact.ParseComposition(blob, "anything:4053").DriverInstancePlans.Count + .ShouldBe(DeploymentArtifact.ParseComposition(blob).DriverInstancePlans.Count); +} +``` + +**Step 2: Run — verify FAIL** (2-arg `ParseComposition` missing): +`dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests --filter "FullyQualifiedName~DeploymentArtifactTests"` + +**Step 3: Implement** + +Add to `DeploymentArtifact.cs` (after the no-arg `ParseComposition`): + +```csharp +/// Cluster-scoped overload: the address-space composition a node should materialise given +/// its NodeId. Filters every projection to the node's own ClusterId (see ). +/// The deployment artifact blob. +/// This node's identity in "host:port" form. +/// The filtered composition per the node's scoping decision. +public static Phase7CompositionResult ParseComposition(ReadOnlySpan blob, string nodeId) +{ + var scope = ResolveClusterScope(blob, nodeId); + if (scope.Mode == ClusterFilterMode.None) return ParseComposition(blob); + if (scope.Mode == ClusterFilterMode.Suppress) return Empty(); + + var full = ParseComposition(blob); + var sets = BuildClusterSets(blob, scope.ClusterId!); + return new Phase7CompositionResult( + full.UnsAreas.Where(a => sets.AreaIds.Contains(a.UnsAreaId)).ToArray(), + full.UnsLines.Where(l => sets.AreaIds.Contains(l.UnsAreaId)).ToArray(), + full.EquipmentNodes.Where(e => sets.EquipmentIds.Contains(e.EquipmentId)).ToArray(), + full.DriverInstancePlans.Where(d => sets.DriverIds.Contains(d.DriverInstanceId)).ToArray(), + full.ScriptedAlarmPlans.Where(a => sets.EquipmentIds.Contains(a.EquipmentId)).ToArray(), + full.GalaxyTags.Where(t => sets.DriverIds.Contains(t.DriverInstanceId)).ToArray()) + { + EquipmentTags = full.EquipmentTags.Where(t => sets.DriverIds.Contains(t.DriverInstanceId)).ToArray(), + }; +} + +private sealed record ClusterSets(HashSet DriverIds, HashSet AreaIds, HashSet EquipmentIds); + +/// Build the in-cluster id sets used to filter a composition: DriverInstanceIds + UnsAreaIds +/// that directly carry the ClusterId, plus EquipmentIds whose DriverInstanceId is in-cluster. +private static ClusterSets BuildClusterSets(ReadOnlySpan blob, string clusterId) +{ + var driverIds = new HashSet(StringComparer.Ordinal); + var areaIds = new HashSet(StringComparer.Ordinal); + var equipmentIds = new HashSet(StringComparer.Ordinal); + try + { + using var doc = JsonDocument.Parse(blob.ToArray()); + var root = doc.RootElement; + CollectIdsWhereCluster(root, "DriverInstances", "DriverInstanceId", clusterId, driverIds); + CollectIdsWhereCluster(root, "UnsAreas", "UnsAreaId", clusterId, areaIds); + // Equipment carries no ClusterId — include it when its DriverInstanceId is in-cluster. + if (root.TryGetProperty("Equipment", out var eq) && eq.ValueKind == JsonValueKind.Array) + { + foreach (var el in eq.EnumerateArray()) + { + if (el.ValueKind != JsonValueKind.Object) continue; + var di = el.TryGetProperty("DriverInstanceId", out var diEl) ? diEl.GetString() : null; + var id = el.TryGetProperty("EquipmentId", out var idEl) ? idEl.GetString() : null; + if (!string.IsNullOrWhiteSpace(id) && di is not null && driverIds.Contains(di)) + equipmentIds.Add(id!); + } + } + } + catch (JsonException) { /* empty sets ⇒ nothing matches ⇒ empty composition */ } + return new ClusterSets(driverIds, areaIds, equipmentIds); +} + +private static void CollectIdsWhereCluster( + JsonElement root, string arrayName, string idField, string clusterId, HashSet into) +{ + if (!root.TryGetProperty(arrayName, out var arr) || arr.ValueKind != JsonValueKind.Array) return; + foreach (var el in arr.EnumerateArray()) + { + if (el.ValueKind != JsonValueKind.Object) continue; + var cid = el.TryGetProperty("ClusterId", out var cEl) ? cEl.GetString() : null; + if (!string.Equals(cid, clusterId, StringComparison.Ordinal)) continue; + var id = el.TryGetProperty(idField, out var idEl) ? idEl.GetString() : null; + if (!string.IsNullOrWhiteSpace(id)) into.Add(id!); + } +} +``` + +Note: equipment is filtered via its `DriverInstanceId` (schema-guaranteed present +for equipment-namespace rows). If a future schema allows equipment with a null +`DriverInstanceId`, extend `BuildClusterSets` to also include equipment whose +`UnsLineId` maps to an in-cluster `UnsArea` — out of scope here (the dev rig's +sites are empty). + +**Step 4: Run — verify PASS** (new + pre-existing tests): +`dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests --filter "FullyQualifiedName~DeploymentArtifactTests"` + +**Step 5: Commit** + +```bash +git add src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DeploymentArtifact.cs \ + tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/DeploymentArtifactTests.cs +git commit -m "feat(runtime): node-scoped ParseComposition filters address space by ClusterId" +``` + +**Acceptance:** scoped `ParseComposition` excludes cross-cluster projections; +single-cluster + unknown-node behavior matches the rule; no-arg overload untouched. + +--- + +## Task 3: Wire driver-spawn + SubscribeBulk filtering into `DriverHostActor` + +**Classification:** high-risk +**Estimated implement time:** ~5 min +**Parallelizable with:** Task 4 + +**Blocked by:** Task 1, Task 2. + +**Files:** +- Modify: `src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DriverHostActor.cs:367` (ReconcileDrivers) and `:432` (PushDesiredSubscriptions) +- Test: `tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/` (add `DriverHostActorClusterScopeTests.cs`, or extend an existing DriverHostActor test if present) + +**Context:** `ReconcileDrivers` (line 349) loads the artifact blob then calls +`ParseDriverInstances(blob)`. `PushDesiredSubscriptions` (line 412) calls +`ParseComposition(blob)`. Both run for normal applies (`ApplyAndAck`, line 311/321) +**and** restart restore (`RestoreApplied`, line 393/395) — so changing these two +call sites covers both paths. The ack (`SendAck`, line 314) fires unconditionally +*before* the rebuild, so an empty/suppressed slice still acks — no ack change needed. + +**Step 1: Write the failing test** + +A TestKit test that a driver node in a multi-cluster artifact spawns only its +cluster's drivers, and a node whose cluster has no drivers still reaches Applied +(acks). Model it on the existing DriverHostActor tests in the same folder (reuse +their in-memory DbContext + DispatchDeployment plumbing — inspect a sibling test +for the exact harness helpers). Core assertions: + +```csharp +// Given a sealed deployment whose artifact has 2 clusters (MAIN: 1 driver, SITE-A: 1 driver) +// and a DriverHostActor whose _localNode = "site-a-1:4053": +// - after DispatchDeployment, GetDiagnostics shows ONLY the SITE-A driver (not MAIN's) +// - the node sends an Applied ApplyAck (convergence holds even though it ignored MAIN's driver) +// And a second actor with _localNode = "central-1:4053" shows ONLY the MAIN driver. +``` + +**Step 2: Run — verify FAIL** (node currently spawns both clusters' drivers): +`dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests --filter "FullyQualifiedName~DriverHostActorClusterScope"` + +**Step 3: Implement** — two one-line call-site changes: + +`DriverHostActor.cs:367`: +```csharp +// before: +var specs = DeploymentArtifact.ParseDriverInstances(blob); +// after: +var specs = DeploymentArtifact.ParseDriverInstances(blob, _localNode.Value); +``` + +`DriverHostActor.cs:432`: +```csharp +// before: +composition = DeploymentArtifact.ParseComposition(blob); +// after: +composition = DeploymentArtifact.ParseComposition(blob, _localNode.Value); +``` + +**Step 4: Run — verify PASS, then the whole Runtime suite for no regression:** +```bash +dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests +``` +Expected: new test green; all pre-existing tests green (single-cluster harnesses +hit the `None` branch → unchanged). + +**Step 5: Commit** + +```bash +git add src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DriverHostActor.cs \ + tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/DriverHostActorClusterScopeTests.cs +git commit -m "feat(runtime): DriverHost spawns + subscribes only its own ClusterId's drivers" +``` + +**Acceptance:** a site node spawns only its cluster's drivers and still acks +Applied with an empty slice; existing single-cluster tests stay green. + +--- + +## Task 4: Wire scoped composition into `OpcUaPublishActor.HandleRebuild` + +**Classification:** high-risk +**Estimated implement time:** ~5 min +**Parallelizable with:** Task 3 + +**Blocked by:** Task 2. + +**Files:** +- Modify: `src/Server/ZB.MOM.WW.OtOpcUa.Runtime/OpcUa/OpcUaPublishActor.cs:212` (HandleRebuild) +- Test: `tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/OpcUa/OpcUaPublishActorRebuildTests.cs` + +**Context:** `HandleRebuild` (line ~210) loads the artifact then calls +`ParseComposition(artifact)` and materialises via `Phase7Applier`. `_localNode` is +`NodeId?` (line 46) — null on legacy/dev callers, so guard for null. The existing +`OpcUaPublishActorRebuildTests` use a fake/inspectable sink — reuse that pattern. + +**Step 1: Write the failing test** + +```csharp +// Build a 2-cluster artifact (MAIN galaxy tag t-main; SITE-A galaxy tag t-sa), +// seal it as a Deployment row in the test DbContext, construct an OpcUaPublishActor +// with _localNode = NodeId.Parse("site-a-1:4053") and an inspectable sink, send +// RebuildAddressSpace(correlation, depId), then assert the sink received ONLY the +// SITE-A variable/folders (t-sa) and NOT the MAIN ones (t-main). +// Mirror with _localNode = "central-1:4053" → only MAIN. +``` + +**Step 2: Run — verify FAIL** (sink currently gets both clusters' nodes). + +**Step 3: Implement** — `OpcUaPublishActor.cs:212`: +```csharp +// before: +var composition = DeploymentArtifact.ParseComposition(artifact); +// after: scope to this node's ClusterId when we know our identity; legacy/dev callers (null +// _localNode) keep the unscoped behaviour. +var composition = _localNode is { } ln + ? DeploymentArtifact.ParseComposition(artifact, ln.Value) + : DeploymentArtifact.ParseComposition(artifact); +``` + +**Step 4: Run — verify PASS + full Runtime suite:** +```bash +dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests +``` + +**Step 5: Commit** + +```bash +git add src/Server/ZB.MOM.WW.OtOpcUa.Runtime/OpcUa/OpcUaPublishActor.cs \ + tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/OpcUa/OpcUaPublishActorRebuildTests.cs +git commit -m "feat(runtime): OPC UA rebuild materialises only the node's ClusterId slice" +``` + +**Acceptance:** a node materialises only its own cluster's address space; null +`_localNode` keeps legacy behavior; existing rebuild tests stay green. + +--- + +## Task 5: Multi-cluster scoping E2E on the cluster harness + +**Classification:** high-risk +**Estimated implement time:** ~5 min +**Parallelizable with:** none + +**Blocked by:** Task 3, Task 4. + +**Files:** +- Test: `tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests/` (add `MultiClusterScopingTests.cs`) +- Possibly Modify: `tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests/TwoNodeClusterHarness.cs` (only if a 2-ClusterId seed helper is needed) + +**Context:** `TwoNodeClusterHarness` boots an in-process 2-node cluster on an +in-memory DB with a null OPC UA sink. It proves the *deploy path* end-to-end +(compose → broadcast → apply → ack) but cannot assert a materialised tree (null +sink). So this test asserts **driver** scoping through the real path: seed two +`ServerCluster` rows + two `ClusterNode` rows (one per node, different `ClusterId`) ++ one `DriverInstance` per cluster, run one deployment, and assert via each node's +`GetDiagnostics` that each node hosts only its own cluster's driver. + +If the harness's seed helpers can't express two clusters cleanly, that's a plan +defect — surface it; the unit + actor tests (Tasks 1–4) already cover the scoping +logic, and Task 9 covers the live proof. + +**Step 1: Write the failing test** (per the context above). +**Step 2: Run — verify FAIL.** +**Step 3:** No production code — this test passes once Tasks 3+4 are in. If it +fails, the defect is in 3/4; fix there, not here. +**Step 4: Run — verify PASS:** +`dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests --filter "FullyQualifiedName~MultiClusterScoping"` +**Step 5: Commit** +```bash +git add tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests/MultiClusterScopingTests.cs +# add TwoNodeClusterHarness.cs ONLY if you modified it +git commit -m "test(integration): multi-cluster deploy scopes drivers per node" +``` + +**Acceptance:** one deploy over a 2-cluster mesh leaves each node hosting only its +own cluster's driver. + +--- + +## Task 6: Rewrite `docker-dev/docker-compose.yml` for the single mesh + +**Classification:** standard +**Estimated implement time:** ~5 min +**Parallelizable with:** Task 1, Task 2, Task 7, Task 8 + +**Files:** +- Modify: `docker-dev/docker-compose.yml` + +**Context:** Today MAIN is 4 nodes (`admin-a`/`admin-b` admin + `driver-a`/`driver-b` +driver) as one mesh; SITE-A/SITE-B are 2-node fused meshes with their own seeds. +The anchor `&otopcua-host` (currently on `admin-a`) holds the shared build + env. + +**Steps:** + +1. Replace the four MAIN services with **two fused nodes**: + - `central-1` (becomes the `&otopcua-host` anchor): `OTOPCUA_ROLES: "admin,driver"`, + `ASPNETCORE_URLS: "http://+:9000"`, `Cluster__PublicHostname: "central-1"`, + `Cluster__SeedNodes__0: "akka.tcp://otopcua@central-1:4053"`, `Cluster__Roles__0: "admin"`, + `Cluster__Roles__1: "driver"`, keep all `Security__*` (Jwt/Ldap/DeployApiKey) + `GALAXY_MXGW_API_KEY` + (keep the existing `${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_...}` default — do **not** introduce a new + hardcoded key), `ports: ["4840:4840"]`. + - `central-2`: same env, `Cluster__PublicHostname: "central-2"`, seed → `central-1`, + `ports: ["4841:4840"]`, `depends_on: { sql: healthy, central-1: started }`. +2. Convert the four site services to **driver-only**, all seeding `central-1`: + - For each of `site-a-1`, `site-a-2`, `site-b-1`, `site-b-2`: `OTOPCUA_ROLES: "driver"`, + `Cluster__Roles__0: "driver"` (remove the `admin` role + `Cluster__Roles__1`), keep + `Cluster__PublicHostname` = own name, set `Cluster__SeedNodes__0: "akka.tcp://otopcua@central-1:4053"`, + **remove** `ASPNETCORE_URLS` + the `Security__Jwt__*` / `Security__Ldap__*` / `Security__DeployApiKey` + block (driver-only nodes serve no UI and authenticate no users), keep the `ConnectionStrings__ConfigDb` + + `GALAXY_MXGW_API_KEY` lines, keep their OPC UA ports (`4842`–`4845`), and add + `depends_on: { sql: healthy, central-1: started }`. +3. Update `traefik.depends_on` to `[central-1, central-2]` (drop the removed services). +4. Rewrite the header comment block (lines ~1–40) to describe the **single mesh, + hub-and-spoke** topology: one Akka mesh seeded by `central-1`; `central-1/2` + are `admin,driver` (the only UI + deploy singleton); `site-*` are `driver`-only + members scoped by `ClusterId`; central UI at `:9200` manages + deploys to all. + Keep the existing accurate notes (SQL persistence, mesh isolation note now + describes a single mesh, headless deploy via `:9200/api/deployments`). + +**Verify:** +```bash +docker compose -f docker-dev/docker-compose.yml config --quiet && echo "compose OK" +``` +Expected: `compose OK`. + +**Commit:** +```bash +git add docker-dev/docker-compose.yml +git commit -m "feat(docker-dev): single-mesh hub-and-spoke (central-1/2 + driver-only sites)" +``` + +**Acceptance:** `docker compose config` parses; central nodes fused with UI; site +nodes driver-only seeding central; no new hardcoded API key. + +--- + +## Task 7: Rewrite `docker-dev/traefik-dynamic.yml` (central-only route) + +**Classification:** small +**Estimated implement time:** ~3 min +**Parallelizable with:** Task 1, Task 2, Task 6, Task 8 + +**Files:** +- Modify: `docker-dev/traefik-dynamic.yml` + +**Steps:** +1. Keep the `otopcua-admin` router (`PathPrefix(`/`)`) + service, but point its + `loadBalancer.servers` at `http://central-1:9000` and `http://central-2:9000`. +2. Delete the `otopcua-site-a` and `otopcua-site-b` routers **and** their services + (driver-only sites serve no UI). Keep the sticky-cookie + `/health/active` + healthcheck on the surviving `otopcua-admin` service. +3. Update the file header comment to describe the single central UI route. + +**Verify:** covered by Task 6's `docker compose config` (Traefik file is mounted, +not parsed by compose) — sanity-check it's valid YAML by eye; the live bring-up in +Task 9 is the real check. + +**Commit:** +```bash +git add docker-dev/traefik-dynamic.yml +git commit -m "feat(docker-dev): Traefik routes only the central cluster UI" +``` + +**Acceptance:** one router → central-1/central-2; site routers/services removed. + +--- + +## Task 8: Rewrite `docker-dev/seed/seed-clusters.sql` (MAIN nodes → central-1/2) + +**Classification:** small +**Estimated implement time:** ~4 min +**Parallelizable with:** Task 1, Task 2, Task 6, Task 7 + +**Files:** +- Modify: `docker-dev/seed/seed-clusters.sql` + +**Context:** The seed inserts 3 `ServerCluster` rows + 6 `ClusterNode` rows. MAIN's +`ClusterNode` rows are currently `driver-a:4053` / `driver-b:4053`. Sites already +have **no** drivers/tags (only `ServerCluster` + `ClusterNode`), which matches the +"empty sites" decision — leave them empty. + +**Steps:** +1. Change MAIN's two `ClusterNode` inserts from `driver-a` / `driver-b` to + `central-1` / `central-2`: `NodeId` `central-1:4053` / `central-2:4053`, + `Host` `central-1` / `central-2`, `ApplicationUri` `urn:OtOpcUa:central-1` / + `urn:OtOpcUa:central-2`, keep `OpcUaPort 4840`, `ServiceLevelBase` 200/150. + Update the `IF NOT EXISTS ... WHERE NodeId = '...'` guards to the new ids. +2. Update the MAIN `ServerCluster` `Notes` to "central-1/central-2 fused + admin+driver — UI + deploy singleton + MAIN OPC UA publishers." +3. Update the SITE-A/SITE-B `ServerCluster` `Notes` to "2-node driver-only, + managed by the central cluster over the shared mesh (empty until configured)." +4. Update the file header comment block (the `ClusterNode` map at lines ~5–7) to + `central-1, central-2 → MAIN`. +5. Leave the Galaxy namespace/driver/tags (MAIN) and the LDAP→role mappings + unchanged. + +**Verify:** SQL isn't run locally on macOS (no SQL reachable); correctness is +confirmed by the live bring-up in Task 9 (the `cluster-seed` job runs it). Eyeball +that every changed `NodeId`/guard is consistent. + +**Commit:** +```bash +git add docker-dev/seed/seed-clusters.sql +git commit -m "feat(docker-dev): seed MAIN ClusterNodes as central-1/central-2" +``` + +**Acceptance:** MAIN `ClusterNode` rows are `central-1`/`central-2`; sites keep +their cluster + node rows with no drivers; notes/header updated. + +--- + +## Task 9: Live docker-dev verification + +**Classification:** standard (verification — no subagent review needed) +**Estimated implement time:** ~5 min (plus container build time) +**Parallelizable with:** none + +**Blocked by:** Task 3, Task 4, Task 5, Task 6, Task 7, Task 8. + +**Files:** none (operational). + +**Steps (run from repo root on this Mac; Docker is local):** +1. Sync deployment + rebuild the image and bring the rig up: + ```bash + docker compose -f docker-dev/docker-compose.yml down + docker compose -f docker-dev/docker-compose.yml up -d --build + ``` +2. Confirm the mesh formed and the seed ran: + ```bash + docker compose -f docker-dev/docker-compose.yml ps + docker compose -f docker-dev/docker-compose.yml logs cluster-seed --tail=40 + ``` + Expect 3 `ServerCluster` rows + 6 `ClusterNode` rows (`central-1/2`, `site-a-1/2`, `site-b-1/2`). +3. Trigger a global deploy headlessly (no UI login needed — the deploy API is on + the central admin nodes): + ```bash + curl -s -X POST http://localhost:9200/api/deployments \ + -H "X-Api-Key: docker-dev-deploy-key" -H "Content-Type: application/json" \ + -d '{"createdBy":"per-cluster-verify"}' + ``` + Expect `202` + a `deploymentId`. +4. Confirm scoping in the driver logs — central applies the Galaxy driver, sites apply empty: + ```bash + docker compose -f docker-dev/docker-compose.yml logs central-1 | grep -iE "applied deployment|galaxy|materialis" + docker compose -f docker-dev/docker-compose.yml logs site-a-1 | grep -iE "applied deployment|galaxy|materialis" + ``` + Expect `central-1` to materialise the MAIN Galaxy tags; `site-a-1` to apply with **no** Galaxy/MAIN nodes. +5. Browse-check with the Client CLI: + ```bash + dotnet run --project src/Client/ZB.MOM.WW.OtOpcUa.Client.CLI -- browse -u opc.tcp://localhost:4840 -r -d 4 # central: Galaxy tree + dotnet run --project src/Client/ZB.MOM.WW.OtOpcUa.Client.CLI -- browse -u opc.tcp://localhost:4842 -r -d 4 # site-a: empty (no MAIN tree) + ``` + Expect the MAIN Galaxy hierarchy on `:4840` and an **empty** address space on `:4842` (NOT the merged tree). + +**Acceptance:** the rig boots as one mesh; a single deploy populates the central +OPC UA tree and leaves the site nodes empty (proving per-ClusterId scoping live). +If anything regresses, stop and debug (do not paper over). + +--- + +## Task 10: Update docker-dev docs + memory + +**Classification:** small +**Estimated implement time:** ~4 min +**Parallelizable with:** Task 9 + +**Blocked by:** Task 6 (final topology). + +**Files:** +- Modify: `CLAUDE.md` (the Docker Workflow / docker-dev references that mention the cluster layout) +- Modify: `docs/v2/dev-environment.md` (if it describes the three-isolated-mesh topology) +- Modify: `/Users/dohertj2/.claude/projects/-Users-dohertj2-Desktop-OtOpcUa/memory/project_dev_environment.md` + `MEMORY.md` pointer + +**Steps:** +1. In `CLAUDE.md` and `docs/v2/dev-environment.md`, update any description of the + docker-dev topology from "three isolated meshes / MAIN admin-a+admin-b / + site fused" to "single mesh, hub-and-spoke: `central-1`/`central-2` fused + admin+driver own the only UI + deploy singleton; `site-*` are driver-only + members scoped by `ClusterId`; central UI at `:9200` deploys to all." Update + the OPC UA endpoint list (`central-1` `:4840`, `central-2` `:4841`, sites + `:4842`–`:4845`). +2. Update the `project_dev_environment.md` memory's docker-dev section to match + (node names, hub-and-spoke, "central UI deploys to all clusters"). Keep the + one-line `MEMORY.md` pointer accurate. + +**Commit:** +```bash +git add CLAUDE.md docs/v2/dev-environment.md +git commit -m "docs(docker-dev): document single-mesh hub-and-spoke topology" +``` +(The memory files live outside the repo — write them with the memory workflow, not git.) + +**Acceptance:** docs + memory describe the new topology accurately. + +--- + +## Done criteria + +- `dotnet build ZB.MOM.WW.OtOpcUa.slnx` clean. +- `dotnet test tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests` green (new scoping + tests + no regressions) and `...Host.IntegrationTests` multi-cluster test green. +- The full pre-existing suite stays green (the `ClusterCount ≤ 1` lenient branch + guarantees single-cluster behavior is unchanged). +- `docker compose config` parses; the live rig (Task 9) shows central serving the + Galaxy tree and sites empty under one global deploy. + +## Out of scope (follow-ups) + +- **Per-cluster deploy** (deploy just SITE-A from the UI) — coordinator + UI work. +- **Seeding demo drivers on the sites** — added via the central UI. diff --git a/docs/plans/2026-06-07-per-cluster-scoping.md.tasks.json b/docs/plans/2026-06-07-per-cluster-scoping.md.tasks.json new file mode 100644 index 00000000..aef782bc --- /dev/null +++ b/docs/plans/2026-06-07-per-cluster-scoping.md.tasks.json @@ -0,0 +1,16 @@ +{ + "planPath": "docs/plans/2026-06-07-per-cluster-scoping.md", + "tasks": [ + {"id": 1, "subject": "Task 1: ResolveClusterScope + node-scoped ParseDriverInstances", "status": "pending"}, + {"id": 2, "subject": "Task 2: Node-scoped ParseComposition (address-space filter)", "status": "pending", "blockedBy": [1]}, + {"id": 3, "subject": "Task 3: Wire driver-spawn + SubscribeBulk filtering into DriverHostActor", "status": "pending", "blockedBy": [1, 2]}, + {"id": 4, "subject": "Task 4: Wire scoped composition into OpcUaPublishActor.HandleRebuild", "status": "pending", "blockedBy": [2]}, + {"id": 5, "subject": "Task 5: Multi-cluster scoping E2E on the cluster harness", "status": "pending", "blockedBy": [3, 4]}, + {"id": 6, "subject": "Task 6: Rewrite docker-dev/docker-compose.yml for the single mesh", "status": "pending"}, + {"id": 7, "subject": "Task 7: Rewrite docker-dev/traefik-dynamic.yml (central-only route)", "status": "pending"}, + {"id": 8, "subject": "Task 8: Rewrite docker-dev/seed/seed-clusters.sql (MAIN nodes -> central-1/2)", "status": "pending"}, + {"id": 9, "subject": "Task 9: Live docker-dev verification", "status": "pending", "blockedBy": [3, 4, 5, 6, 7, 8]}, + {"id": 10, "subject": "Task 10: Update docker-dev docs + memory", "status": "pending", "blockedBy": [6]} + ], + "lastUpdated": "2026-06-07" +}