feat(opcua): surface failed inbound writes to clients (fail-fast, Bad blip, audit event)

Three deferred 'surface the failed write' enhancements on the write-outcome self-correction path in OtOpcUaNodeManager: - Item A: synchronous structural fail-fast. EvaluateEquipmentWriteStructure (pure static) rejects a structurally-invalid write INLINE (Bad sync) after the authz gate but before the optimistic dispatch, so the SDK never applies it. Null payload -> BadTypeMismatch; plus a confidence-gated cheap built-in type compatibility check (numeric widening + BaseDataType wildcard tolerated; uncertain cases defer to the SDK's own coercion). - Item B: Bad-quality blip on device-write failure. On a revert, RevertOptimisticWriteIfNeeded first publishes the still-applied optimistic value with StatusCode BadDeviceFailure, then restores the prior value/status (both under the existing Lock). Documents the queue-coalescing caveat (a slow subscriber may see only the restored value -> the audit event is the reliable signal). - Item C: Part 8 AuditWriteUpdateEvent on device-write failure. Builds an AuditWriteUpdateEventState (SourceNode=node, AttributeId=Value, OldValue=prior, NewValue=attempted, ClientUserId from the threaded identity, Message carries outcome.Reason) under Lock and reports it via Server.ReportEvent OUTSIDE Lock. Guarded so auditing-disabled / report failure never breaks the revert. Threads the writing identity's user-id + node into the continuation. Adds 6 unit tests for EvaluateEquipmentWriteStructure. Build clean (0 warnings); 158/158 OpcUaServer.Tests green.
2026-06-15 02:38:57 -04:00
parent dcb0be650e
commit bb59fd4e75
2 changed files with 317 additions and 15 deletions
@@ -678,15 +678,24 @@ public sealed class OtOpcUaNodeManager : CustomNodeManager2
    ///         <see cref="ServiceResult.Good"/> lets the SDK apply the written value optimistically.
    ///     </para>
    ///     <para>
+    ///         <b>Item A — synchronous structural fail-fast.</b> After the authz gate passes but BEFORE the
+    ///         optimistic dispatch, a pure <see cref="EvaluateEquipmentWriteStructure"/> pre-check rejects a
+    ///         structurally-invalid write (e.g. a <c>null</c> payload, or a confidently-detected built-in-type
+    ///         mismatch) INLINE — returning Bad synchronously so the SDK never applies it, avoiding the
+    ///         optimistic-Good-then-revert round-trip + a pointless device dispatch.
+    ///     </para>
+    ///     <para>
    ///         <b>Write-outcome self-correction.</b> Before returning Good (which makes the SDK overwrite the
    ///         node with <paramref name="value"/>) we capture both the optimistic value AND the node's REAL
-    ///         prior value/status — at handler entry the node still holds the prior value. An off-Lock
-    ///         continuation on the <see cref="NodeWriteOutcome"/> then reverts the node to that prior
-    ///         value/status on a FAILED outcome, but ONLY while the node still holds the optimistic value, so a
-    ///         fresh driver poll that already republished the confirmed register value is not clobbered
-    ///         (<see cref="RevertOptimisticWriteIfNeeded"/> / <see cref="ShouldRevert"/>). On success the
-    ///         optimistic value stands and the next poll re-confirms it via the normal <see cref="WriteValue"/>
-    ///         path.
+    ///         prior value/status — at handler entry the node still holds the prior value — plus the writing
+    ///         principal's user-id (threaded to the audit event). An off-Lock continuation on the
+    ///         <see cref="NodeWriteOutcome"/> then, on a FAILED outcome and ONLY while the node still holds the
+    ///         optimistic value (so a fresh driver poll that already republished the confirmed register value is
+    ///         not clobbered): surfaces a transient <b>Bad-quality blip</b> (Item B), reverts the node to its
+    ///         prior value/status, and raises a Part 8 <b>AuditWriteUpdateEvent</b> (Item C) recording the
+    ///         rejected write (<see cref="RevertOptimisticWriteIfNeeded"/> / <see cref="ShouldRevert"/>). On
+    ///         success the optimistic value stands and the next poll re-confirms it via the normal
+    ///         <see cref="WriteValue"/> path.
    ///     </para>
    /// </summary>
    private ServiceResult OnEquipmentTagWrite(
@@ -698,6 +707,13 @@ public sealed class OtOpcUaNodeManager : CustomNodeManager2
        var gate = EvaluateEquipmentWriteGate(identity, gateway is not NullOpcUaNodeWriteGateway);
        if (gate is not null) return gate;

+        // Item A (synchronous structural fail-fast): reject a structurally-invalid write INLINE — return Bad
+        // synchronously so the SDK never applies it (no optimistic-Good-then-revert round-trip + no needless
+        // device dispatch). Runs AFTER the authz gate (so we never leak structure detail to an unauthorised
+        // caller) but BEFORE the optimistic dispatch below.
+        var structure = EvaluateEquipmentWriteStructure(value, node);
+        if (structure is not null) return structure;
+
        // Capture the optimistic value + the REAL prior value/status BEFORE the SDK applies the write
        // (at handler entry the node still holds the prior value; returning Good makes the SDK apply `value`).
        var optimisticValue = value;
@@ -709,6 +725,11 @@ public sealed class OtOpcUaNodeManager : CustomNodeManager2
            priorValue = variable.Value;
            priorStatus = variable.StatusCode;
        }
+        // Item C: thread the writing principal's user-id string into the failure continuation so the audit
+        // event can populate ClientUserId. Resolved here off the same identity the gate used (null when the
+        // session is anonymous / carries no role-carrying identity — the gate would already have vetoed, so in
+        // practice non-null on this path, but kept defensive).
+        var clientUserId = identity?.DisplayName;

        // Fire-and-forget — MUST NOT block under Lock. On a FAILED outcome, compare-and-revert (off-Lock
        // continuation). A faulted/cancelled WriteAsync is treated as a failure so the optimistic value never
@@ -721,7 +742,7 @@ public sealed class OtOpcUaNodeManager : CustomNodeManager2
                t =>
                {
                    var outcome = t.IsCompletedSuccessfully ? t.Result : new NodeWriteOutcome(false, "write dispatch faulted");
-                    RevertOptimisticWriteIfNeeded(nodeKey, outcome, optimisticValue, priorValue, priorStatus);
+                    RevertOptimisticWriteIfNeeded(nodeKey, outcome, optimisticValue, priorValue, priorStatus, clientUserId);
                },
                CancellationToken.None, TaskContinuationOptions.RunContinuationsAsynchronously, TaskScheduler.Default);

@@ -762,6 +783,92 @@ public sealed class OtOpcUaNodeManager : CustomNodeManager2
        return null;
    }

+    /// <summary>
+    ///     <b>Item A — synchronous structural fail-fast.</b> Pure structural pre-check for an inbound
+    ///     equipment-tag write, run AFTER <see cref="EvaluateEquipmentWriteGate"/> passes but BEFORE the
+    ///     optimistic device dispatch in <see cref="OnEquipmentTagWrite"/>. A structurally-invalid write is
+    ///     rejected INLINE (a Bad <see cref="ServiceResult"/> is returned synchronously, so the SDK never
+    ///     applies the value) instead of being optimistically applied and later reverted — saving both the
+    ///     phantom value-blip and a pointless device round-trip.
+    ///     <para>
+    ///         <b>Interpretation / tradeoff (the item was deliberately under-specified).</b> The minimum
+    ///         sensible structural check is a <c>null</c> value write to a value variable ⇒
+    ///         <c>BadTypeMismatch</c> (a value node always holds a typed scalar/array; a null payload is never a
+    ///         valid value write here). On top of that, when the node is a <see cref="BaseDataVariableState"/>
+    ///         whose <see cref="BaseVariableState.DataType"/> resolves to a concrete built-in type AND the
+    ///         written value's runtime built-in type is also resolvable, a CHEAP built-in-type compatibility
+    ///         check is applied: a clear mismatch ⇒ <c>BadTypeMismatch</c>. The check is intentionally
+    ///         conservative — it only rejects when BOTH the expected and actual built-in types are confidently
+    ///         resolved AND differ (with numeric widening + the BaseDataType "accept anything" wildcard
+    ///         allowed); anything uncertain (a non-variable node, an unresolved/abstract DataType, a
+    ///         <see cref="Variant"/>/array payload whose element type isn't cheaply known) is allowed through so
+    ///         the SDK's own (authoritative) type coercion in <c>BaseVariableState.WriteValue</c> remains the
+    ///         final arbiter. We deliberately do NOT attempt deep array-dimension / structured-type validation
+    ///         here — that is left to the SDK.
+    ///     </para>
+    ///     Pure (no SDK server / Lock needed): reads only <paramref name="value"/> and the node's static
+    ///     <c>DataType</c>, so it is unit-testable in isolation.
+    /// </summary>
+    /// <param name="value">The value the client wrote (the SDK's pre-coercion payload).</param>
+    /// <param name="node">The target node (expected to be a writable <see cref="BaseDataVariableState"/>).</param>
+    /// <returns><c>null</c> to proceed; otherwise the veto <see cref="ServiceResult"/>
+    /// (<c>BadTypeMismatch</c> for a null write or a confidently-detected built-in-type mismatch).</returns>
+    internal static ServiceResult? EvaluateEquipmentWriteStructure(object? value, NodeState node)
+    {
+        // Minimum sensible check: a null payload is never a valid value write to a value variable.
+        if (value is null)
+        {
+            return new ServiceResult(StatusCodes.BadTypeMismatch, "null value write rejected");
+        }
+
+        // Cheap, confidence-gated built-in-type compatibility check. Only when the node is a value variable
+        // with a concretely-resolvable expected built-in type AND the payload's built-in type is also cheaply
+        // resolvable do we compare; otherwise proceed and let the SDK's WriteValue coercion be authoritative.
+        if (node is not BaseDataVariableState variable) return null;
+        var expected = TypeInfo.GetBuiltInType(variable.DataType); // NodeId ⇒ built-in; abstract/unknown ⇒ Null
+        if (expected is BuiltInType.Null or BuiltInType.Variant or BuiltInType.DataValue) return null; // unresolved / wildcard
+
+        // TypeInfo.Construct(object) classifies the runtime payload; an unclassifiable value ⇒ Unknown (Null).
+        var actual = TypeInfo.Construct(value).BuiltInType;
+        if (actual == BuiltInType.Null) return null; // couldn't classify the payload ⇒ defer to the SDK
+
+        if (!IsBuiltInTypeCompatible(expected, actual))
+        {
+            return new ServiceResult(
+                StatusCodes.BadTypeMismatch,
+                $"value built-in type {actual} is not compatible with node data type {expected}");
+        }
+
+        return null;
+    }
+
+    /// <summary>
+    ///     Conservative built-in-type compatibility test for <see cref="EvaluateEquipmentWriteStructure"/>.
+    ///     Returns true (compatible — allow through) unless there is a confident mismatch. Exact matches pass;
+    ///     numeric-to-numeric is treated as compatible (the SDK widens/narrows numerics); a
+    ///     <see cref="BuiltInType.ByteString"/>↔<see cref="BuiltInType.Byte"/> array nuance and any
+    ///     <see cref="BuiltInType.Variant"/> on either side are treated as compatible. Only a clear cross-family
+    ///     mismatch (e.g. writing a String to a Boolean node) returns false. Pure + static.
+    /// </summary>
+    private static bool IsBuiltInTypeCompatible(BuiltInType expected, BuiltInType actual)
+    {
+        if (expected == actual) return true;
+        // Any side a wildcard/unclassified ⇒ defer (compatible) — the caller already filtered most of these.
+        if (expected is BuiltInType.Variant or BuiltInType.Null) return true;
+        if (actual is BuiltInType.Variant or BuiltInType.Null) return true;
+        // Numeric family widening/narrowing is the SDK's job; treat numeric↔numeric as compatible.
+        if (IsNumeric(expected) && IsNumeric(actual)) return true;
+        // ByteString and Byte are routinely interchangeable on the wire; don't reject that pairing.
+        if ((expected, actual) is (BuiltInType.ByteString, BuiltInType.Byte) or (BuiltInType.Byte, BuiltInType.ByteString)) return true;
+        return false;
+    }
+
+    /// <summary>True for the OPC UA numeric built-in types (the integer + floating families).</summary>
+    private static bool IsNumeric(BuiltInType t) => t is
+        BuiltInType.SByte or BuiltInType.Byte or BuiltInType.Int16 or BuiltInType.UInt16 or
+        BuiltInType.Int32 or BuiltInType.UInt32 or BuiltInType.Int64 or BuiltInType.UInt64 or
+        BuiltInType.Float or BuiltInType.Double;
+
    /// <summary>
    ///     Pure decision for the write-outcome self-correction: revert the node to its pre-write value ONLY on
    ///     a FAILED outcome AND only while the node still holds the optimistic value. The
@@ -780,28 +887,151 @@ public sealed class OtOpcUaNodeManager : CustomNodeManager2

    /// <summary>
    ///     Off-Lock continuation body for the write-outcome self-correction: re-takes <c>Lock</c> and, when
-    ///     <see cref="ShouldRevert"/> says so, reverts the node's Value + StatusCode to the captured pre-write
-    ///     value/status and notifies subscribers (same node-update shape as <see cref="WriteValue"/>). A
-    ///     no-op when the node was rebuilt/removed in the interim, when the outcome succeeded, or when a fresh
-    ///     poll already moved the node off the optimistic value. Silent — this node manager carries no logger;
-    ///     the gateway logs the underlying write failure.
+    ///     <see cref="ShouldRevert"/> says so, surfaces the device-write rejection to subscribed clients in three
+    ///     ways, then leaves the node holding its captured pre-write value/status (same node-update shape as
+    ///     <see cref="WriteValue"/>). A no-op when the node was rebuilt/removed in the interim, when the outcome
+    ///     succeeded, or when a fresh poll already moved the node off the optimistic value.
+    ///     <list type="number">
+    ///         <item>
+    ///             <b>Item B — Bad-quality blip.</b> Before restoring the prior value, the node is published
+    ///             once holding the (still-applied) optimistic value but with StatusCode
+    ///             <c>BadDeviceFailure</c>, then published again with the prior value/status restored. A
+    ///             value-subscribed client therefore sees the rejection (a Bad quality) rather than the value
+    ///             silently snapping back. <b>Caveat:</b> the two <c>ClearChangeMasks</c> calls happen
+    ///             back-to-back within one server publishing interval, so the SDK's monitored-item queue may
+    ///             COALESCE them — a slow / queue-size-1 subscriber can see only the final restored value and
+    ///             miss the transient Bad blip. The blip is a best-effort live signal; the
+    ///             <see cref="AuditWriteUpdateEventState"/> raised below (Item C) is the reliable, durable record
+    ///             of the rejected write.
+    ///         </item>
+    ///         <item>
+    ///             <b>Item C — AuditWriteUpdateEvent.</b> A Part 8 <see cref="AuditWriteUpdateEventState"/> is
+    ///             raised through the SDK <see cref="CustomNodeManager2.Server"/> so an auditing client gets a
+    ///             durable record of the rejected write (OldValue = prior, NewValue = the attempted optimistic
+    ///             value, the boolean AuditEvent Status = false ⇒ failed, and the device's
+    ///             <paramref name="outcome"/>.Reason carried in the event Message). It is reported
+    ///             OUTSIDE <c>Lock</c> (the event-state is built under Lock, then reported after release) to keep
+    ///             the lock hold short and avoid any re-entrancy risk via the server's event path. Auditing being
+    ///             disabled / no subscribers is handled gracefully — <c>ReportEvent</c> simply reaches no monitored
+    ///             items, and any failure is swallowed (logged to the SDK trace) so the revert is never broken by it.
+    ///         </item>
+    ///     </list>
+    ///     Silent value-wise — this node manager carries no logger; the gateway logs the underlying write failure
+    ///     and the SDK trace captures any audit-report failure.
    /// </summary>
    /// <param name="nodeId">The string id of the written variable node.</param>
    /// <param name="outcome">The device-write outcome routed back by the gateway.</param>
    /// <param name="optimisticValue">The value the SDK optimistically applied on the write.</param>
    /// <param name="priorValue">The node's real value captured before the optimistic write.</param>
    /// <param name="priorStatus">The node's real status captured before the optimistic write.</param>
+    /// <param name="clientUserId">The writing principal's user-id string (the identity's DisplayName), threaded
+    /// from <see cref="OnEquipmentTagWrite"/> to populate the audit event's ClientUserId; null when unknown.</param>
    private void RevertOptimisticWriteIfNeeded(
-        string nodeId, NodeWriteOutcome outcome, object? optimisticValue, object? priorValue, StatusCode priorStatus)
+        string nodeId, NodeWriteOutcome outcome, object? optimisticValue, object? priorValue, StatusCode priorStatus,
+        string? clientUserId)
    {
+        // Built under Lock if (and only if) a revert is performed, then reported AFTER Lock is released.
+        AuditWriteUpdateEventState? auditEvent = null;
+
        lock (Lock)
        {
            if (!_variables.TryGetValue(nodeId, out var variable)) return;       // rebuilt/removed ⇒ no-op
            if (!ShouldRevert(outcome, variable.Value, optimisticValue)) return; // success, or poll moved it on
+
+            // Item B: surface a transient Bad-quality blip on the still-applied optimistic value, then restore
+            // the prior value/status. Both publishes are under this single Lock hold (mirrors WriteValue's
+            // node-update shape). See the method remarks for the queue-coalescing caveat.
+            variable.StatusCode = StatusCodes.BadDeviceFailure;
+            variable.Timestamp = DateTime.UtcNow;
+            variable.ClearChangeMasks(SystemContext, includeChildren: false); // notify — the Bad blip
+
            variable.Value = priorValue;
            variable.StatusCode = priorStatus;
            variable.Timestamp = DateTime.UtcNow;
-            variable.ClearChangeMasks(SystemContext, includeChildren: false);
+            variable.ClearChangeMasks(SystemContext, includeChildren: false); // notify — restore prior
+
+            // Item C: build the audit event-state while we hold the node reference + Lock, but DON'T report yet.
+            auditEvent = BuildWriteFailureAuditEvent(variable, outcome, optimisticValue, priorValue, clientUserId);
+        }
+
+        // Report OUTSIDE Lock — keeps the hold short and sidesteps any re-entrancy through the server event path.
+        if (auditEvent is not null) ReportAuditEvent(auditEvent);
+    }
+
+    /// <summary>
+    ///     Item C — build (but do not report) a Part 8 <see cref="AuditWriteUpdateEventState"/> recording a
+    ///     rejected device write on <paramref name="variable"/>. The caller holds <c>Lock</c> (the node is read
+    ///     here); reporting happens after Lock release. The standard AuditEvent envelope (EventId, EventType,
+    ///     Time/ReceiveTime, ServerId, Severity, Message, SourceNode/SourceName, Status, ActionTimeStamp) is
+    ///     stamped by the SDK's <see cref="AuditEventState.Initialize(ISystemContext, NodeState, EventSeverity,
+    ///     LocalizedText, bool, DateTime)"/> helper; this method then fills the write-specific fields
+    ///     (AttributeId = Value, IndexRange = empty, OldValue = prior, NewValue = attempted) plus ClientUserId.
+    /// </summary>
+    /// <param name="variable">The written node (the audit event's SourceNode).</param>
+    /// <param name="outcome">The failed device-write outcome (its Reason goes into the event Message + Status).</param>
+    /// <param name="optimisticValue">The value the client attempted (the audit NewValue).</param>
+    /// <param name="priorValue">The node's real pre-write value (the audit OldValue).</param>
+    /// <param name="clientUserId">The writing principal's user-id string; null when unknown.</param>
+    /// <returns>A populated, unreported <see cref="AuditWriteUpdateEventState"/>.</returns>
+    private AuditWriteUpdateEventState BuildWriteFailureAuditEvent(
+        BaseDataVariableState variable, NodeWriteOutcome outcome, object? optimisticValue, object? priorValue,
+        string? clientUserId)
+    {
+        var reason = string.IsNullOrEmpty(outcome.Reason) ? "device write rejected" : outcome.Reason!;
+
+        var audit = new AuditWriteUpdateEventState(null);
+        // Initialize stamps EventId (fresh GUID bytes), EventType, Time/ReceiveTime, ServerId, Severity, Message,
+        // SourceNode/SourceName (from `variable`), Status and ActionTimeStamp. The AuditEvent `Status` field is a
+        // PropertyState<bool> (true=succeeded / false=failed), so the failed device write is recorded as `false`;
+        // the device's textual Reason is carried in the Message (the StatusCode itself has no audit-field home).
+        audit.Initialize(
+            SystemContext,
+            source: variable,
+            severity: EventSeverity.Medium,
+            message: new LocalizedText($"Inbound write rejected by device: {reason}"),
+            status: false,
+            actionTimestamp: DateTime.UtcNow);
+
+        // Write-specific fields (AuditWriteUpdateEventType). AttributeId = Value; IndexRange empty (full write);
+        // OldValue = the real pre-write value; NewValue = the value the client attempted. The AuditWriteUpdate
+        // child PropertyStates are NOT instantiated by Initialize — SetChildValue lazily creates + sets them
+        // (verified against the 1.5.378 SDK; it tolerates a null value, creating the child with Value=null).
+        audit.SetChildValue(SystemContext, BrowseNames.AttributeId, (uint)Attributes.Value, false);
+        audit.SetChildValue(SystemContext, BrowseNames.IndexRange, string.Empty, false);
+        audit.SetChildValue(SystemContext, BrowseNames.OldValue, priorValue!, false);
+        audit.SetChildValue(SystemContext, BrowseNames.NewValue, optimisticValue!, false);
+
+        // Standard AuditEvent client-identity fields. ClientUserId is the writing principal (threaded from the
+        // handler); ClientAuditEntryId carries the SDK context's audit entry id when present.
+        if (clientUserId is not null) audit.SetChildValue(SystemContext, BrowseNames.ClientUserId, clientUserId, false);
+        var auditEntryId = SystemContext.AuditEntryId;
+        if (!string.IsNullOrEmpty(auditEntryId))
+            audit.SetChildValue(SystemContext, BrowseNames.ClientAuditEntryId, auditEntryId, false);
+
+        return audit;
+    }
+
+    /// <summary>
+    ///     Item C — report a built audit event through the SDK server, guarding against auditing being disabled
+    ///     / no subscribers / a transient server-event-path failure. A failure here MUST NOT break the revert
+    ///     that already happened, so it is swallowed and logged to the SDK trace (this node manager has no
+    ///     <c>ILogger</c>) rather than propagated. Reported with the node manager's <c>SystemContext</c>, which
+    ///     is what the alarm event path uses too.
+    /// </summary>
+    /// <param name="auditEvent">The populated audit event-state to report.</param>
+    private void ReportAuditEvent(AuditWriteUpdateEventState auditEvent)
+    {
+        try
+        {
+            Server.ReportEvent(SystemContext, auditEvent);
+        }
+        catch (Exception ex)
+        {
+            // Auditing disabled / no monitored items / server shutting down ⇒ ReportEvent may no-op or throw;
+            // either way the revert already stands. Surface a recurring failure in the SDK trace, don't rethrow.
+#pragma warning disable CS0618 // Utils.LogError is [Obsolete] in favour of an ITelemetryContext this manager doesn't carry.
+            Utils.LogError(ex, "OtOpcUaNodeManager: failed to report AuditWriteUpdateEvent for {0}", auditEvent.SourceNode?.Value);
+#pragma warning restore CS0618
        }
    }