From 1aa7905676e4ba5d4b15c1b0ca2e04fb23bbfbc0 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Fri, 19 Jun 2026 10:52:22 -0400 Subject: [PATCH] review(ControlPlane): fix premature deploy-seal from unexpected-node ack Review at HEAD 7286d320. ControlPlane-001 (Medium): ConfigPublishCoordinator.HandleAck now discards acks from nodes not in _expectedAcks (prevented premature SealDeployment) + regression test. -002 (flipped-node log count), -003 (redundant mapper arms) tidied. --- code-reviews/ControlPlane/findings.md | 109 ++++++++++++++++++ .../Audit/AuditOutcomeMapper.cs | 14 +-- .../Coordinators/ConfigPublishCoordinator.cs | 10 ++ .../Fleet/FleetStatusBroadcaster.cs | 6 +- .../ConfigPublishCoordinatorTests.cs | 55 +++++++++ 5 files changed, 181 insertions(+), 13 deletions(-) create mode 100644 code-reviews/ControlPlane/findings.md diff --git a/code-reviews/ControlPlane/findings.md b/code-reviews/ControlPlane/findings.md new file mode 100644 index 00000000..a9d6ff6d --- /dev/null +++ b/code-reviews/ControlPlane/findings.md @@ -0,0 +1,109 @@ +# Code Review — ControlPlane + + + +| Field | Value | +|---|---| +| Module | `src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane` | +| Reviewer | Claude Code | +| Review date | 2026-06-19 | +| Commit reviewed | `7286d320` | +| Status | Reviewed | +| Open findings | 0 | + +## Checklist coverage + +A comprehensive review completes every category, recording "No issues found" where +a category produced nothing rather than leaving it blank. + +| # | Category | Result | +|---|---|---| +| 1 | Correctness & logic bugs | ControlPlane-001 (Resolved) | +| 2 | OtOpcUa conventions | No issues found | +| 3 | Concurrency & thread safety | No issues found | +| 4 | Error handling & resilience | No issues found | +| 5 | Security | No issues found | +| 6 | Performance & resource management | No issues found | +| 7 | Design-document adherence | No issues found | +| 8 | Code organization & conventions | ControlPlane-002 (Resolved), ControlPlane-003 (Resolved) | +| 9 | Testing coverage | No issues found | +| 10 | Documentation & comments | No issues found | + +## Findings + + + +### ControlPlane-001 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Correctness & logic bugs | +| Location | `src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane/Coordinators/ConfigPublishCoordinator.cs:154` | +| Status | Resolved | + +**Description:** `HandleAck` accepts `ApplyAck` messages from any node, including nodes that were +not in `_expectedAcks` at dispatch time. When an unexpected node's ack is added to `_acks`, +`_acks.Count` can reach `_expectedAcks.Count` before every expected node has actually responded. +The count-based gate on line 166 (`if (_acks.Count < _expectedAcks.Count) return;`) then fires +prematurely, causing `SealDeployment()` to run while one or more expected nodes have not yet +applied the deployment. In production, unexpected acks are unlikely (only `DriverHostActor` +publishes to the `deployment-acks` topic) but the guard is missing. + +**Recommendation:** Add a membership check at the start of the processing path in `HandleAck` +to discard any ack whose `NodeId` is not in `_expectedAcks`, logging it at Debug level for +diagnostics. + +**Resolution:** Fixed 2026-06-19 (commit SHA pending). Added `if (!_expectedAcks.Contains(msg.NodeId))` guard at the top of the processing path in `HandleAck`, discarding unexpected-node acks before they are counted. Regression test `ApplyAck_from_unexpected_node_is_discarded_and_does_not_seal_prematurely` added to `ConfigPublishCoordinatorTests` (proven failing before fix, green after). + +--- + +### ControlPlane-002 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane/Fleet/FleetStatusBroadcaster.cs:139` | +| Status | Resolved | + +**Description:** `OnTick` uses a `bool changed` flag but passes it to the debug-log message +via `changed ? 1 : 0`. The `{Count}` placeholder in the message implies a numeric count of +flipped nodes, but the expression always yields `1` (never the true count) whenever any node +flips. Operators reading the log will see "flipped 1 nodes to Degraded" even when multiple +nodes flipped simultaneously during a cluster partition recovery. + +**Recommendation:** Replace the `bool changed` flag with an `int flippedCount` counter +incremented inside the loop, and log `flippedCount` directly. + +**Resolution:** Fixed 2026-06-19 (commit SHA pending). Replaced `bool changed` with `int flippedCount`, incremented per flipped node, and logged directly. All 60 tests pass. + +--- + +### ControlPlane-003 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane/Audit/AuditOutcomeMapper.cs:29` | +| Status | Resolved | + +**Description:** `FromAction` lists ten explicit `AuditOutcome.Success` arms that are all +shadowed by the `_ => AuditOutcome.Success` default. The explicit arms add no behavioural +differentiation and create an ongoing maintenance obligation: any new config-write verb added +to the codebase silently falls through to the correct default anyway, making the explicit list +misleading as a whitelist. The comment block documenting the known verbs is a better vehicle +for the enumeration than code. + +**Recommendation:** Remove the redundant explicit Success arms and document the known success +verbs in a comment on the default arm instead. The two Denied arms must stay. + +**Resolution:** Fixed 2026-06-19 (commit SHA pending). Removed the ten redundant explicit +Success arms; replaced with an explanatory comment listing the known success verbs on the +default `_ => AuditOutcome.Success` arm. All existing AuditOutcomeMapper tests still pass +(the `Outcome_is_derived_from_the_action_vocabulary` theory covers every listed verb). diff --git a/src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane/Audit/AuditOutcomeMapper.cs b/src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane/Audit/AuditOutcomeMapper.cs index c3e15072..fd4367be 100644 --- a/src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane/Audit/AuditOutcomeMapper.cs +++ b/src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane/Audit/AuditOutcomeMapper.cs @@ -29,16 +29,10 @@ public static class AuditOutcomeMapper public static AuditOutcome FromAction(string action) => action switch { "OpcUaAccessDenied" or "CrossClusterNamespaceAttempt" => AuditOutcome.Denied, - "DraftCreated" - or "DraftEdited" - or "Published" - or "RolledBack" - or "NodeApplied" - or "ClusterCreated" - or "NodeAdded" - or "CredentialAdded" - or "CredentialDisabled" - or "ExternalIdReleased" => AuditOutcome.Success, + // All other known config-write verbs (DraftCreated, DraftEdited, Published, RolledBack, + // NodeApplied, ClusterCreated, NodeAdded, CredentialAdded, CredentialDisabled, + // ExternalIdReleased) and any future verbs default to Success — config writes are the + // overwhelming majority and the only non-success cases are the two Denied entries above. _ => AuditOutcome.Success, }; } diff --git a/src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane/Coordinators/ConfigPublishCoordinator.cs b/src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane/Coordinators/ConfigPublishCoordinator.cs index c9c36104..79438ca6 100644 --- a/src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane/Coordinators/ConfigPublishCoordinator.cs +++ b/src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane/Coordinators/ConfigPublishCoordinator.cs @@ -160,6 +160,16 @@ public sealed class ConfigPublishCoordinator : ReceiveActor, IWithTimers return; } + // Discard acks from nodes that were not in the expected-ack set at dispatch time. + // Without this guard an unexpected-node ack inflates _acks.Count and can trigger a + // premature seal/fail before every expected node has actually applied the deployment. + if (!_expectedAcks.Contains(msg.NodeId)) + { + _log.Debug("Discarding ApplyAck from unexpected node {Node} for {Id} (not in expected set)", + msg.NodeId, msg.DeploymentId); + return; + } + _acks[msg.NodeId] = msg.Outcome; PersistNodeAck(msg); diff --git a/src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane/Fleet/FleetStatusBroadcaster.cs b/src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane/Fleet/FleetStatusBroadcaster.cs index 182305c4..1ca85343 100644 --- a/src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane/Fleet/FleetStatusBroadcaster.cs +++ b/src/Server/ZB.MOM.WW.OtOpcUa.ControlPlane/Fleet/FleetStatusBroadcaster.cs @@ -127,16 +127,16 @@ public sealed class FleetStatusBroadcaster : ReceiveActor, IWithTimers private void OnTick() { var stale = DateTime.UtcNow - HeartbeatTimeout; - var changed = false; + var flippedCount = 0; foreach (var kv in _nodes.ToList()) { if (kv.Value.LastSeenUtc < stale && kv.Value.Health == FleetNodeHealth.Healthy) { _nodes[kv.Key] = kv.Value with { Health = FleetNodeHealth.Degraded }; - changed = true; + flippedCount++; } } - if (changed) _log.Debug("FleetStatusBroadcaster flipped {Count} nodes to Degraded", changed ? 1 : 0); + if (flippedCount > 0) _log.Debug("FleetStatusBroadcaster flipped {Count} nodes to Degraded", flippedCount); PublishSnapshot(); } diff --git a/tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests/ConfigPublishCoordinatorTests.cs b/tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests/ConfigPublishCoordinatorTests.cs index e32dcb29..fc16f0b5 100644 --- a/tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests/ConfigPublishCoordinatorTests.cs +++ b/tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests/ConfigPublishCoordinatorTests.cs @@ -60,6 +60,61 @@ public sealed class ConfigPublishCoordinatorTests : ControlPlaneActorTestBase db.Deployments.Single().Status.ShouldBe(DeploymentStatus.Sealed); } + /// + /// Regression guard for ControlPlane-001: an ApplyAck from a node that was NOT in the expected-ack + /// set (i.e. not a driver-role member when DispatchDeployment ran) must be discarded. + /// Without the fix, an unexpected-node ack inflates _acks.Count and can cause the + /// coordinator to seal a deployment before every expected node has responded. + /// + /// Scenario: dispatch with zero expected nodes seals immediately (baseline). A truly-unexpected + /// node later sends an ack for a fresh deployment that HAS one expected node — the ack from the + /// unexpected node must be ignored so the deployment waits for the real node. + /// + [Fact] + public void ApplyAck_from_unexpected_node_is_discarded_and_does_not_seal_prematurely() + { + var dbFactory = NewInMemoryDbFactory(); + var deploymentId = SeedDispatchingDeployment(dbFactory); + + // Seed a NodeDeploymentState row for "expected-driver" so the coordinator sees 1 expected ack. + using (var db = dbFactory.CreateDbContext()) + { + db.NodeDeploymentStates.Add(new Configuration.Entities.NodeDeploymentState + { + NodeId = "expected-driver", + DeploymentId = deploymentId.Value, + Status = Configuration.Enums.NodeDeploymentStatus.Applying, + }); + db.SaveChanges(); + } + + // Long deadline so time does not confound the test. + var actor = Sys.ActorOf(ConfigPublishCoordinator.Props(dbFactory, TimeSpan.FromMinutes(5))); + + // Drive the coordinator into the AwaitingApplyAcks state via DispatchDeployment. + // The coordinator seeds expected-acks from _cluster.State.Members (filtered to driver role). + // In this test harness the cluster has no driver-role members, so _expectedAcks is empty + // and the coordinator seals immediately on DispatchDeployment. + // + // To test the discard logic we use the PreStart recovery path instead: start the coordinator + // WITHOUT dispatching so it recovers the in-flight deployment from the DB (the NodeDeploymentState + // row seeds _expectedAcks = {"expected-driver"}), then send an ack from an UNEXPECTED node. + // If the bug is present the count check fires and seals; with the fix the ack is discarded. + + // Send ack from a rogue node NOT in _expectedAcks. + actor.Tell(new ApplyAck(deploymentId, NodeId.Parse("rogue-node"), + ApplyAckOutcome.Applied, null, CorrelationId.NewId())); + + // Give it time to process and potentially seal if buggy. + ExpectNoMsg(TimeSpan.FromMilliseconds(400)); + + using var db2 = dbFactory.CreateDbContext(); + var status = db2.Deployments.Single().Status; + // Deployment must still be AwaitingApplyAcks (or Dispatching from seed) — NOT Sealed or PartiallyFailed. + status.ShouldNotBe(DeploymentStatus.Sealed); + status.ShouldNotBe(DeploymentStatus.PartiallyFailed); + } + private static DeploymentId SeedDispatchingDeployment( Microsoft.EntityFrameworkCore.IDbContextFactory dbFactory) {