From bb59fd4e75d5dc37cc4a34a3f2a7d38881a7c235 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 02:38:57 -0400 Subject: [PATCH] feat(opcua): surface failed inbound writes to clients (fail-fast, Bad blip, audit event) Three deferred 'surface the failed write' enhancements on the write-outcome self-correction path in OtOpcUaNodeManager: - Item A: synchronous structural fail-fast. EvaluateEquipmentWriteStructure (pure static) rejects a structurally-invalid write INLINE (Bad sync) after the authz gate but before the optimistic dispatch, so the SDK never applies it. Null payload -> BadTypeMismatch; plus a confidence-gated cheap built-in type compatibility check (numeric widening + BaseDataType wildcard tolerated; uncertain cases defer to the SDK's own coercion). - Item B: Bad-quality blip on device-write failure. On a revert, RevertOptimisticWriteIfNeeded first publishes the still-applied optimistic value with StatusCode BadDeviceFailure, then restores the prior value/status (both under the existing Lock). Documents the queue-coalescing caveat (a slow subscriber may see only the restored value -> the audit event is the reliable signal). - Item C: Part 8 AuditWriteUpdateEvent on device-write failure. Builds an AuditWriteUpdateEventState (SourceNode=node, AttributeId=Value, OldValue=prior, NewValue=attempted, ClientUserId from the threaded identity, Message carries outcome.Reason) under Lock and reports it via Server.ReportEvent OUTSIDE Lock. Guarded so auditing-disabled / report failure never breaks the revert. Threads the writing identity's user-id + node into the continuation. Adds 6 unit tests for EvaluateEquipmentWriteStructure. Build clean (0 warnings); 158/158 OpcUaServer.Tests green. --- .../OtOpcUaNodeManager.cs | 260 +++++++++++++++++- .../EquipmentWriteGateTests.cs | 72 +++++ 2 files changed, 317 insertions(+), 15 deletions(-) diff --git a/src/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer/OtOpcUaNodeManager.cs b/src/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer/OtOpcUaNodeManager.cs index 80d601a1..7cc28982 100644 --- a/src/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer/OtOpcUaNodeManager.cs +++ b/src/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer/OtOpcUaNodeManager.cs @@ -678,15 +678,24 @@ public sealed class OtOpcUaNodeManager : CustomNodeManager2 /// lets the SDK apply the written value optimistically. /// /// + /// Item A — synchronous structural fail-fast. After the authz gate passes but BEFORE the + /// optimistic dispatch, a pure pre-check rejects a + /// structurally-invalid write (e.g. a null payload, or a confidently-detected built-in-type + /// mismatch) INLINE — returning Bad synchronously so the SDK never applies it, avoiding the + /// optimistic-Good-then-revert round-trip + a pointless device dispatch. + /// + /// /// Write-outcome self-correction. Before returning Good (which makes the SDK overwrite the /// node with ) we capture both the optimistic value AND the node's REAL - /// prior value/status — at handler entry the node still holds the prior value. An off-Lock - /// continuation on the then reverts the node to that prior - /// value/status on a FAILED outcome, but ONLY while the node still holds the optimistic value, so a - /// fresh driver poll that already republished the confirmed register value is not clobbered - /// ( / ). On success the - /// optimistic value stands and the next poll re-confirms it via the normal - /// path. + /// prior value/status — at handler entry the node still holds the prior value — plus the writing + /// principal's user-id (threaded to the audit event). An off-Lock continuation on the + /// then, on a FAILED outcome and ONLY while the node still holds the + /// optimistic value (so a fresh driver poll that already republished the confirmed register value is + /// not clobbered): surfaces a transient Bad-quality blip (Item B), reverts the node to its + /// prior value/status, and raises a Part 8 AuditWriteUpdateEvent (Item C) recording the + /// rejected write ( / ). On + /// success the optimistic value stands and the next poll re-confirms it via the normal + /// path. /// /// private ServiceResult OnEquipmentTagWrite( @@ -698,6 +707,13 @@ public sealed class OtOpcUaNodeManager : CustomNodeManager2 var gate = EvaluateEquipmentWriteGate(identity, gateway is not NullOpcUaNodeWriteGateway); if (gate is not null) return gate; + // Item A (synchronous structural fail-fast): reject a structurally-invalid write INLINE — return Bad + // synchronously so the SDK never applies it (no optimistic-Good-then-revert round-trip + no needless + // device dispatch). Runs AFTER the authz gate (so we never leak structure detail to an unauthorised + // caller) but BEFORE the optimistic dispatch below. + var structure = EvaluateEquipmentWriteStructure(value, node); + if (structure is not null) return structure; + // Capture the optimistic value + the REAL prior value/status BEFORE the SDK applies the write // (at handler entry the node still holds the prior value; returning Good makes the SDK apply `value`). var optimisticValue = value; @@ -709,6 +725,11 @@ public sealed class OtOpcUaNodeManager : CustomNodeManager2 priorValue = variable.Value; priorStatus = variable.StatusCode; } + // Item C: thread the writing principal's user-id string into the failure continuation so the audit + // event can populate ClientUserId. Resolved here off the same identity the gate used (null when the + // session is anonymous / carries no role-carrying identity — the gate would already have vetoed, so in + // practice non-null on this path, but kept defensive). + var clientUserId = identity?.DisplayName; // Fire-and-forget — MUST NOT block under Lock. On a FAILED outcome, compare-and-revert (off-Lock // continuation). A faulted/cancelled WriteAsync is treated as a failure so the optimistic value never @@ -721,7 +742,7 @@ public sealed class OtOpcUaNodeManager : CustomNodeManager2 t => { var outcome = t.IsCompletedSuccessfully ? t.Result : new NodeWriteOutcome(false, "write dispatch faulted"); - RevertOptimisticWriteIfNeeded(nodeKey, outcome, optimisticValue, priorValue, priorStatus); + RevertOptimisticWriteIfNeeded(nodeKey, outcome, optimisticValue, priorValue, priorStatus, clientUserId); }, CancellationToken.None, TaskContinuationOptions.RunContinuationsAsynchronously, TaskScheduler.Default); @@ -762,6 +783,92 @@ public sealed class OtOpcUaNodeManager : CustomNodeManager2 return null; } + /// + /// Item A — synchronous structural fail-fast. Pure structural pre-check for an inbound + /// equipment-tag write, run AFTER passes but BEFORE the + /// optimistic device dispatch in . A structurally-invalid write is + /// rejected INLINE (a Bad is returned synchronously, so the SDK never + /// applies the value) instead of being optimistically applied and later reverted — saving both the + /// phantom value-blip and a pointless device round-trip. + /// + /// Interpretation / tradeoff (the item was deliberately under-specified). The minimum + /// sensible structural check is a null value write to a value variable ⇒ + /// BadTypeMismatch (a value node always holds a typed scalar/array; a null payload is never a + /// valid value write here). On top of that, when the node is a + /// whose resolves to a concrete built-in type AND the + /// written value's runtime built-in type is also resolvable, a CHEAP built-in-type compatibility + /// check is applied: a clear mismatch ⇒ BadTypeMismatch. The check is intentionally + /// conservative — it only rejects when BOTH the expected and actual built-in types are confidently + /// resolved AND differ (with numeric widening + the BaseDataType "accept anything" wildcard + /// allowed); anything uncertain (a non-variable node, an unresolved/abstract DataType, a + /// /array payload whose element type isn't cheaply known) is allowed through so + /// the SDK's own (authoritative) type coercion in BaseVariableState.WriteValue remains the + /// final arbiter. We deliberately do NOT attempt deep array-dimension / structured-type validation + /// here — that is left to the SDK. + /// + /// Pure (no SDK server / Lock needed): reads only and the node's static + /// DataType, so it is unit-testable in isolation. + /// + /// The value the client wrote (the SDK's pre-coercion payload). + /// The target node (expected to be a writable ). + /// null to proceed; otherwise the veto + /// (BadTypeMismatch for a null write or a confidently-detected built-in-type mismatch). + internal static ServiceResult? EvaluateEquipmentWriteStructure(object? value, NodeState node) + { + // Minimum sensible check: a null payload is never a valid value write to a value variable. + if (value is null) + { + return new ServiceResult(StatusCodes.BadTypeMismatch, "null value write rejected"); + } + + // Cheap, confidence-gated built-in-type compatibility check. Only when the node is a value variable + // with a concretely-resolvable expected built-in type AND the payload's built-in type is also cheaply + // resolvable do we compare; otherwise proceed and let the SDK's WriteValue coercion be authoritative. + if (node is not BaseDataVariableState variable) return null; + var expected = TypeInfo.GetBuiltInType(variable.DataType); // NodeId ⇒ built-in; abstract/unknown ⇒ Null + if (expected is BuiltInType.Null or BuiltInType.Variant or BuiltInType.DataValue) return null; // unresolved / wildcard + + // TypeInfo.Construct(object) classifies the runtime payload; an unclassifiable value ⇒ Unknown (Null). + var actual = TypeInfo.Construct(value).BuiltInType; + if (actual == BuiltInType.Null) return null; // couldn't classify the payload ⇒ defer to the SDK + + if (!IsBuiltInTypeCompatible(expected, actual)) + { + return new ServiceResult( + StatusCodes.BadTypeMismatch, + $"value built-in type {actual} is not compatible with node data type {expected}"); + } + + return null; + } + + /// + /// Conservative built-in-type compatibility test for . + /// Returns true (compatible — allow through) unless there is a confident mismatch. Exact matches pass; + /// numeric-to-numeric is treated as compatible (the SDK widens/narrows numerics); a + /// array nuance and any + /// on either side are treated as compatible. Only a clear cross-family + /// mismatch (e.g. writing a String to a Boolean node) returns false. Pure + static. + /// + private static bool IsBuiltInTypeCompatible(BuiltInType expected, BuiltInType actual) + { + if (expected == actual) return true; + // Any side a wildcard/unclassified ⇒ defer (compatible) — the caller already filtered most of these. + if (expected is BuiltInType.Variant or BuiltInType.Null) return true; + if (actual is BuiltInType.Variant or BuiltInType.Null) return true; + // Numeric family widening/narrowing is the SDK's job; treat numeric↔numeric as compatible. + if (IsNumeric(expected) && IsNumeric(actual)) return true; + // ByteString and Byte are routinely interchangeable on the wire; don't reject that pairing. + if ((expected, actual) is (BuiltInType.ByteString, BuiltInType.Byte) or (BuiltInType.Byte, BuiltInType.ByteString)) return true; + return false; + } + + /// True for the OPC UA numeric built-in types (the integer + floating families). + private static bool IsNumeric(BuiltInType t) => t is + BuiltInType.SByte or BuiltInType.Byte or BuiltInType.Int16 or BuiltInType.UInt16 or + BuiltInType.Int32 or BuiltInType.UInt32 or BuiltInType.Int64 or BuiltInType.UInt64 or + BuiltInType.Float or BuiltInType.Double; + /// /// Pure decision for the write-outcome self-correction: revert the node to its pre-write value ONLY on /// a FAILED outcome AND only while the node still holds the optimistic value. The @@ -780,28 +887,151 @@ public sealed class OtOpcUaNodeManager : CustomNodeManager2 /// /// Off-Lock continuation body for the write-outcome self-correction: re-takes Lock and, when - /// says so, reverts the node's Value + StatusCode to the captured pre-write - /// value/status and notifies subscribers (same node-update shape as ). A - /// no-op when the node was rebuilt/removed in the interim, when the outcome succeeded, or when a fresh - /// poll already moved the node off the optimistic value. Silent — this node manager carries no logger; - /// the gateway logs the underlying write failure. + /// says so, surfaces the device-write rejection to subscribed clients in three + /// ways, then leaves the node holding its captured pre-write value/status (same node-update shape as + /// ). A no-op when the node was rebuilt/removed in the interim, when the outcome + /// succeeded, or when a fresh poll already moved the node off the optimistic value. + /// + /// + /// Item B — Bad-quality blip. Before restoring the prior value, the node is published + /// once holding the (still-applied) optimistic value but with StatusCode + /// BadDeviceFailure, then published again with the prior value/status restored. A + /// value-subscribed client therefore sees the rejection (a Bad quality) rather than the value + /// silently snapping back. Caveat: the two ClearChangeMasks calls happen + /// back-to-back within one server publishing interval, so the SDK's monitored-item queue may + /// COALESCE them — a slow / queue-size-1 subscriber can see only the final restored value and + /// miss the transient Bad blip. The blip is a best-effort live signal; the + /// raised below (Item C) is the reliable, durable record + /// of the rejected write. + /// + /// + /// Item C — AuditWriteUpdateEvent. A Part 8 is + /// raised through the SDK so an auditing client gets a + /// durable record of the rejected write (OldValue = prior, NewValue = the attempted optimistic + /// value, the boolean AuditEvent Status = false ⇒ failed, and the device's + /// .Reason carried in the event Message). It is reported + /// OUTSIDE Lock (the event-state is built under Lock, then reported after release) to keep + /// the lock hold short and avoid any re-entrancy risk via the server's event path. Auditing being + /// disabled / no subscribers is handled gracefully — ReportEvent simply reaches no monitored + /// items, and any failure is swallowed (logged to the SDK trace) so the revert is never broken by it. + /// + /// + /// Silent value-wise — this node manager carries no logger; the gateway logs the underlying write failure + /// and the SDK trace captures any audit-report failure. /// /// The string id of the written variable node. /// The device-write outcome routed back by the gateway. /// The value the SDK optimistically applied on the write. /// The node's real value captured before the optimistic write. /// The node's real status captured before the optimistic write. + /// The writing principal's user-id string (the identity's DisplayName), threaded + /// from to populate the audit event's ClientUserId; null when unknown. private void RevertOptimisticWriteIfNeeded( - string nodeId, NodeWriteOutcome outcome, object? optimisticValue, object? priorValue, StatusCode priorStatus) + string nodeId, NodeWriteOutcome outcome, object? optimisticValue, object? priorValue, StatusCode priorStatus, + string? clientUserId) { + // Built under Lock if (and only if) a revert is performed, then reported AFTER Lock is released. + AuditWriteUpdateEventState? auditEvent = null; + lock (Lock) { if (!_variables.TryGetValue(nodeId, out var variable)) return; // rebuilt/removed ⇒ no-op if (!ShouldRevert(outcome, variable.Value, optimisticValue)) return; // success, or poll moved it on + + // Item B: surface a transient Bad-quality blip on the still-applied optimistic value, then restore + // the prior value/status. Both publishes are under this single Lock hold (mirrors WriteValue's + // node-update shape). See the method remarks for the queue-coalescing caveat. + variable.StatusCode = StatusCodes.BadDeviceFailure; + variable.Timestamp = DateTime.UtcNow; + variable.ClearChangeMasks(SystemContext, includeChildren: false); // notify — the Bad blip + variable.Value = priorValue; variable.StatusCode = priorStatus; variable.Timestamp = DateTime.UtcNow; - variable.ClearChangeMasks(SystemContext, includeChildren: false); + variable.ClearChangeMasks(SystemContext, includeChildren: false); // notify — restore prior + + // Item C: build the audit event-state while we hold the node reference + Lock, but DON'T report yet. + auditEvent = BuildWriteFailureAuditEvent(variable, outcome, optimisticValue, priorValue, clientUserId); + } + + // Report OUTSIDE Lock — keeps the hold short and sidesteps any re-entrancy through the server event path. + if (auditEvent is not null) ReportAuditEvent(auditEvent); + } + + /// + /// Item C — build (but do not report) a Part 8 recording a + /// rejected device write on . The caller holds Lock (the node is read + /// here); reporting happens after Lock release. The standard AuditEvent envelope (EventId, EventType, + /// Time/ReceiveTime, ServerId, Severity, Message, SourceNode/SourceName, Status, ActionTimeStamp) is + /// stamped by the SDK's helper; this method then fills the write-specific fields + /// (AttributeId = Value, IndexRange = empty, OldValue = prior, NewValue = attempted) plus ClientUserId. + /// + /// The written node (the audit event's SourceNode). + /// The failed device-write outcome (its Reason goes into the event Message + Status). + /// The value the client attempted (the audit NewValue). + /// The node's real pre-write value (the audit OldValue). + /// The writing principal's user-id string; null when unknown. + /// A populated, unreported . + private AuditWriteUpdateEventState BuildWriteFailureAuditEvent( + BaseDataVariableState variable, NodeWriteOutcome outcome, object? optimisticValue, object? priorValue, + string? clientUserId) + { + var reason = string.IsNullOrEmpty(outcome.Reason) ? "device write rejected" : outcome.Reason!; + + var audit = new AuditWriteUpdateEventState(null); + // Initialize stamps EventId (fresh GUID bytes), EventType, Time/ReceiveTime, ServerId, Severity, Message, + // SourceNode/SourceName (from `variable`), Status and ActionTimeStamp. The AuditEvent `Status` field is a + // PropertyState (true=succeeded / false=failed), so the failed device write is recorded as `false`; + // the device's textual Reason is carried in the Message (the StatusCode itself has no audit-field home). + audit.Initialize( + SystemContext, + source: variable, + severity: EventSeverity.Medium, + message: new LocalizedText($"Inbound write rejected by device: {reason}"), + status: false, + actionTimestamp: DateTime.UtcNow); + + // Write-specific fields (AuditWriteUpdateEventType). AttributeId = Value; IndexRange empty (full write); + // OldValue = the real pre-write value; NewValue = the value the client attempted. The AuditWriteUpdate + // child PropertyStates are NOT instantiated by Initialize — SetChildValue lazily creates + sets them + // (verified against the 1.5.378 SDK; it tolerates a null value, creating the child with Value=null). + audit.SetChildValue(SystemContext, BrowseNames.AttributeId, (uint)Attributes.Value, false); + audit.SetChildValue(SystemContext, BrowseNames.IndexRange, string.Empty, false); + audit.SetChildValue(SystemContext, BrowseNames.OldValue, priorValue!, false); + audit.SetChildValue(SystemContext, BrowseNames.NewValue, optimisticValue!, false); + + // Standard AuditEvent client-identity fields. ClientUserId is the writing principal (threaded from the + // handler); ClientAuditEntryId carries the SDK context's audit entry id when present. + if (clientUserId is not null) audit.SetChildValue(SystemContext, BrowseNames.ClientUserId, clientUserId, false); + var auditEntryId = SystemContext.AuditEntryId; + if (!string.IsNullOrEmpty(auditEntryId)) + audit.SetChildValue(SystemContext, BrowseNames.ClientAuditEntryId, auditEntryId, false); + + return audit; + } + + /// + /// Item C — report a built audit event through the SDK server, guarding against auditing being disabled + /// / no subscribers / a transient server-event-path failure. A failure here MUST NOT break the revert + /// that already happened, so it is swallowed and logged to the SDK trace (this node manager has no + /// ILogger) rather than propagated. Reported with the node manager's SystemContext, which + /// is what the alarm event path uses too. + /// + /// The populated audit event-state to report. + private void ReportAuditEvent(AuditWriteUpdateEventState auditEvent) + { + try + { + Server.ReportEvent(SystemContext, auditEvent); + } + catch (Exception ex) + { + // Auditing disabled / no monitored items / server shutting down ⇒ ReportEvent may no-op or throw; + // either way the revert already stands. Surface a recurring failure in the SDK trace, don't rethrow. +#pragma warning disable CS0618 // Utils.LogError is [Obsolete] in favour of an ITelemetryContext this manager doesn't carry. + Utils.LogError(ex, "OtOpcUaNodeManager: failed to report AuditWriteUpdateEvent for {0}", auditEvent.SourceNode?.Value); +#pragma warning restore CS0618 } } diff --git a/tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests/EquipmentWriteGateTests.cs b/tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests/EquipmentWriteGateTests.cs index 06270e36..b29d97a6 100644 --- a/tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests/EquipmentWriteGateTests.cs +++ b/tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests/EquipmentWriteGateTests.cs @@ -106,6 +106,78 @@ public sealed class EquipmentWriteGateTests OtOpcUaNodeManager.ShouldRevert(fail, currentNodeValue: null, optimisticValue: "v").ShouldBeFalse(); } + // ───────────────────────────── Item A — EvaluateEquipmentWriteStructure ───────────────────────────── + + /// (A1) A null value write to a value variable is rejected synchronously with + /// BadTypeMismatch (the minimum sensible structural check — a value node always holds a typed + /// payload, so a null write is never valid and fails fast inline rather than optimistic-then-revert). + [Fact] + public void Structure_null_value_is_rejected_with_type_mismatch() + { + var node = ValueNode(DataTypeIds.Int32); + var result = OtOpcUaNodeManager.EvaluateEquipmentWriteStructure(value: null, node); + + result.ShouldNotBeNull(); + result!.StatusCode.Code.ShouldBe(StatusCodes.BadTypeMismatch); + } + + /// (A2) A type-matching value (Int32 payload into an Int32 node) proceeds (returns null). + [Fact] + public void Structure_matching_type_proceeds() + { + var node = ValueNode(DataTypeIds.Int32); + OtOpcUaNodeManager.EvaluateEquipmentWriteStructure(value: 42, node).ShouldBeNull(); + } + + /// (A3) Numeric-to-numeric is treated as compatible (the SDK widens/narrows numerics) — a + /// Double payload into an Int32 node proceeds rather than being rejected here. + [Fact] + public void Structure_numeric_to_numeric_proceeds() + { + var node = ValueNode(DataTypeIds.Int32); + OtOpcUaNodeManager.EvaluateEquipmentWriteStructure(value: 3.5d, node).ShouldBeNull(); + } + + /// (A4) A confident cross-family mismatch (a String payload into a Boolean node) is rejected + /// with BadTypeMismatch via the cheap built-in-type compatibility check. + [Fact] + public void Structure_cross_family_mismatch_is_rejected() + { + var node = ValueNode(DataTypeIds.Boolean); + var result = OtOpcUaNodeManager.EvaluateEquipmentWriteStructure(value: "not-a-bool", node); + + result.ShouldNotBeNull(); + result!.StatusCode.Code.ShouldBe(StatusCodes.BadTypeMismatch); + } + + /// (A5) Confidence-gated defer: a node whose DataType is the abstract BaseDataType + /// wildcard (unresolved built-in type) proceeds for ANY non-null payload — the SDK's own coercion stays + /// authoritative. A "fallback" equipment tag (unknown DataType string) materialises as exactly this. + [Fact] + public void Structure_unresolved_datatype_defers_to_sdk() + { + var node = ValueNode(DataTypeIds.BaseDataType); + OtOpcUaNodeManager.EvaluateEquipmentWriteStructure(value: "anything", node).ShouldBeNull(); + } + + /// (A6) A non-variable node (defensive) is never rejected on type — only the null-payload check + /// applies. A non-null write to a plain proceeds. + [Fact] + public void Structure_non_variable_node_defers() + { + var folder = new FolderState(null) { NodeId = new NodeId("f", 2), BrowseName = new QualifiedName("f", 2) }; + OtOpcUaNodeManager.EvaluateEquipmentWriteStructure(value: 1, folder).ShouldBeNull(); + } + + private static BaseDataVariableState ValueNode(NodeId dataType) => new(null) + { + NodeId = new NodeId("eq/x", 2), + BrowseName = new QualifiedName("x", 2), + DisplayName = "x", + DataType = dataType, + ValueRank = ValueRanks.Scalar, + }; + private static RoleCarryingUserIdentity IdentityWith(params string[] roles) => new(new UserNameIdentityToken { UserName = "op" }, roles); }