diff --git a/code-reviews/OpcUaServer/findings.md b/code-reviews/OpcUaServer/findings.md index 0317fd76..1caf9fc0 100644 --- a/code-reviews/OpcUaServer/findings.md +++ b/code-reviews/OpcUaServer/findings.md @@ -16,7 +16,7 @@ a category produced nothing rather than leaving it blank. | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | OpcUaServer-001, -002, -003 | +| 1 | Correctness & logic bugs | OpcUaServer-001, -002, -003, -007 | | 2 | OtOpcUa conventions | No issues found | | 3 | Concurrency & thread safety | No issues found (Lock discipline + fire-and-forget dispatch verified correct) | | 4 | Error handling & resilience | OpcUaServer-004 | @@ -209,3 +209,45 @@ claim and the retired `EquipmentNodeWalker`/F14b framing), and the two `OpcUaApp doc blocks (class summary + `BuildUserTokenPolicies`) to describe the shipped impersonation/auth wiring instead of the "F13/F13c pending" framing. Doc-comment-only — no behaviour change; build re-verified green. + +### OpcUaServer-007 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Correctness & logic bugs | +| Location | `OtOpcUaNodeManager.cs:674` (`MaterialiseAlarmCondition` → `alarm.OnTimedUnshelve`) | +| Status | Resolved | + +**Description:** The system-timer auto-unshelve callback (`alarm.OnTimedUnshelve`, wired in +`MaterialiseAlarmCondition`) — fired by the SDK when a `TimedShelve` duration expires — built its +`AlarmCommand` with `User: string.Empty`: +`AlarmCommandRouter?.Invoke(new AlarmCommand(alarmId, "Unshelve", string.Empty, null, null))`. That +command is routed to the scripted-alarm engine, whose `ScriptedAlarmEngine.UnshelveAsync` calls +`Part9StateMachine.ApplyUnshelve(cur, user, _clock())` +(`src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/Part9StateMachine.cs:211-213`), and that method +opens with `if (string.IsNullOrWhiteSpace(user)) throw new ArgumentException("User required.", …)`. +The exception is swallowed downstream (caught by `ScriptedAlarmHostActor`), so this is NOT a crash — +but the auto-unshelve **silently no-ops**: a `TimedShelve` never auto-expires, leaving the alarm +permanently shelved with no operator-visible error. The bug was operationally invisible and even had +a green node-manager test that asserted `cmd.User == string.Empty`, encoding the defect as expected +behaviour. The separate, correct design rule — `OnTimedUnshelve` BYPASSES the `AlarmAck` role gate +because it is a session-less system timer with no client principal (routing through the gated +`HandleAlarmCommand` would return `BadUserAccessDenied`) — is intentional and was preserved; only the +empty `User` string was the defect. + +**Recommendation:** Pass a non-empty system user at the `OnTimedUnshelve` call site so `ApplyUnshelve` +accepts it, while keeping the gate bypass. Use the codebase-wide canonical `"system"` system-actor +label (matching `Part9StateMachine`'s own `AutoUnshelve` audit user, `AlarmConditionState`'s +"engine-internal events ⇒ `system`" doc, and `AuditActor.SystemFallback = "system"`). + +**Resolution:** Resolved — 2026-06-19 (SHA pending): changed the `OnTimedUnshelve` delegate in +`MaterialiseAlarmCondition` (`OtOpcUaNodeManager.cs:674`) to build the `AlarmCommand` with +`User: "system"` instead of `string.Empty`, so the engine's `ApplyUnshelve` user-required guard +accepts the system-initiated unshelve and the timed auto-unshelve actually applies. The `AlarmAck` +gate bypass for the session-less system timer is unchanged. TDD: repurposed the existing +`AlarmCommandRouterTests.OnTimedUnshelve_with_system_context_returns_good_and_routes_unshelve` +(which previously asserted `cmd.User == string.Empty`) to assert a non-empty `"system"` user — +verified it failed against the unfixed source (`cmd.User should not be null or white space`) and +passes after the fix. Surgical one-line src change; no public-contract/wire change. Full +`OpcUaServer.Tests` suite green (275/275). diff --git a/src/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer/OtOpcUaNodeManager.cs b/src/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer/OtOpcUaNodeManager.cs index 8b591777..394a6ba5 100644 --- a/src/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer/OtOpcUaNodeManager.cs +++ b/src/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer/OtOpcUaNodeManager.cs @@ -674,7 +674,14 @@ public sealed class OtOpcUaNodeManager : CustomNodeManager2 alarm.OnTimedUnshelve = (context, condition) => { var alarmId = condition.NodeId.Identifier?.ToString() ?? string.Empty; - AlarmCommandRouter?.Invoke(new AlarmCommand(alarmId, "Unshelve", string.Empty, null, null)); + // User MUST be non-empty: the engine's Part9StateMachine.ApplyUnshelve rejects a + // null/whitespace user (ArgumentException "User required."), and that exception is swallowed + // downstream — an empty user would make the timed auto-unshelve silently no-op, so a + // TimedShelve would never auto-expire. There is no client principal on a system-timer + // unshelve, so label it the canonical engine-internal "system" user (matching the engine's + // own AutoUnshelve audit user and the codebase-wide "system" system-actor convention). The + // AlarmAck gate bypass is intentional and preserved — this is a session-less SDK timer. + AlarmCommandRouter?.Invoke(new AlarmCommand(alarmId, "Unshelve", "system", null, null)); return ServiceResult.Good; }; diff --git a/tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests/AlarmCommandRouterTests.cs b/tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests/AlarmCommandRouterTests.cs index cf683ac4..e5f9f928 100644 --- a/tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests/AlarmCommandRouterTests.cs +++ b/tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests/AlarmCommandRouterTests.cs @@ -270,8 +270,15 @@ public sealed class AlarmCommandRouterTests : IDisposable /// OnTimedUnshelve fires with the SDK's system context (no session, no user identity) — /// the real SDK path when a TimedShelve duration expires. The gate must NOT veto: the result must be - /// Good, the router must be invoked exactly once with Operation == "Unshelve" and User == empty, - /// and no UnshelveAtUtc is carried. + /// Good, the router must be invoked exactly once with Operation == "Unshelve", and no UnshelveAtUtc is + /// carried. + /// + /// Regression (OpcUaServer-007): the routed command's must be a + /// NON-EMPTY system user ("system"). The engine's Part9StateMachine.ApplyUnshelve rejects + /// a null/whitespace user with ArgumentException("User required."); that exception is swallowed + /// downstream, so an empty user made the timed auto-unshelve SILENTLY no-op — a TimedShelve would never + /// auto-expire. Asserting a non-empty user here pins the value the state machine requires. + /// [Fact] public async Task OnTimedUnshelve_with_system_context_returns_good_and_routes_unshelve() { @@ -296,7 +303,9 @@ public sealed class AlarmCommandRouterTests : IDisposable var cmd = captured[0]; cmd.AlarmId.ShouldBe("alm-tu"); // == ScriptedAlarmId / condition NodeId identifier cmd.Operation.ShouldBe("Unshelve"); - cmd.User.ShouldBe(string.Empty); // no client principal — system-initiated + // System-initiated (no client principal), but the user MUST be non-empty so ApplyUnshelve accepts it. + cmd.User.ShouldNotBeNullOrWhiteSpace(); + cmd.User.ShouldBe("system"); cmd.UnshelveAtUtc.ShouldBeNull(); await host.DisposeAsync();