fix(driver-opcuaclient): resolve Medium code-review finding (Driver.OpcUaClient-006)

Route all Session mutations through _probeLock so OnReconnectComplete, ShutdownAsync,
and OnKeepAlive cannot race each other when swapping or clearing the active session.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-22 10:35:11 -04:00
parent 8ceb10d861
commit 412c4bbd40
2 changed files with 272 additions and 59 deletions

View File

@@ -1,3 +1,5 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using Opc.Ua;
using Opc.Ua.Client;
using Opc.Ua.Configuration;
@@ -26,9 +28,23 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient;
/// monitored-item handles. That mechanic lands in PR 69.
/// </para>
/// </remarks>
public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string driverInstanceId)
: IDriver, ITagDiscovery, IReadable, IWritable, ISubscribable, IHostConnectivityProbe, IAlarmSource, IHistoryProvider, IDisposable, IAsyncDisposable
public sealed class OpcUaClientDriver : IDriver, ITagDiscovery, IReadable, IWritable, ISubscribable, IHostConnectivityProbe, IAlarmSource, IHistoryProvider, IDisposable, IAsyncDisposable
{
private readonly ILogger<OpcUaClientDriver> _logger;
/// <param name="options">Driver configuration.</param>
/// <param name="driverInstanceId">Stable logical ID from the config DB.</param>
/// <param name="logger">Optional logger; defaults to NullLogger when not supplied.</param>
public OpcUaClientDriver(OpcUaClientDriverOptions options, string driverInstanceId,
ILogger<OpcUaClientDriver>? logger = null)
{
_options = options;
_driverInstanceId = driverInstanceId;
_logger = logger ?? NullLogger<OpcUaClientDriver>.Instance;
}
private readonly OpcUaClientDriverOptions _options;
private readonly string _driverInstanceId;
// ---- IAlarmSource state ----
private readonly System.Collections.Concurrent.ConcurrentDictionary<long, RemoteAlarmSubscription> _alarmSubscriptions = new();
@@ -55,7 +71,6 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
private const uint StatusBadInternalError = 0x80020000u;
private const uint StatusBadCommunicationError = 0x80050000u;
private readonly OpcUaClientDriverOptions _options = options;
private readonly SemaphoreSlim _gate = new(1, 1);
/// <summary>Active OPC UA session. Null until <see cref="InitializeAsync"/> returns cleanly.</summary>
@@ -69,6 +84,22 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
/// <summary>URL of the endpoint the driver actually connected to. Exposed via <see cref="HostName"/>.</summary>
private string? _connectedEndpointUrl;
/// <summary>
/// Cert-validation delegate wired when <see cref="OpcUaClientDriverOptions.AutoAcceptCertificates"/>
/// is <c>true</c>. Stored so <see cref="Dispose"/> / <see cref="DisposeAsync"/> can
/// detach it from the (potentially process-shared) <see cref="CertificateValidator"/>
/// and avoid leaking the closure (Driver.OpcUaClient-012).
/// </summary>
private CertificateValidationEventHandler? _certValidationHandler;
/// <summary>The <see cref="CertificateValidator"/> that owns <see cref="_certValidationHandler"/>.</summary>
private CertificateValidator? _certValidatorRef;
/// <summary>
/// Approximate count of discovered nodes (folders + variables). Updated by
/// <see cref="DiscoverAsync"/> and used to report a non-zero
/// <see cref="GetMemoryFootprint"/> to the Core allocation-slope detector
/// (Driver.OpcUaClient-013).
/// </summary>
private volatile int _discoveredNodeCount;
/// <summary>
/// SDK-provided reconnect handler that owns the retry loop + session-transfer machinery
/// when the session's keep-alive channel reports a bad status. Null outside the
/// reconnecting window; constructed lazily inside the keep-alive handler. Guarded by
@@ -87,7 +118,7 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
/// </summary>
private NamespaceMap? _namespaceMap;
public string DriverInstanceId => driverInstanceId;
public string DriverInstanceId => _driverInstanceId;
public string DriverType => "OpcUaClient";
public async Task InitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
@@ -227,16 +258,27 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
await config.ValidateAsync(ApplicationType.Client, ct).ConfigureAwait(false);
// Attach a cert-validator handler that honours the AutoAccept flag. Without this,
// AutoAcceptUntrustedCertificates on the config alone isn't always enough in newer
// SDK versions — the validator raises an event the app has to handle.
// AutoAccept=true is a dev-only escape hatch. Emit a prominent warning so a
// production misconfiguration is immediately visible in logs (Driver.OpcUaClient-012).
if (_options.AutoAcceptCertificates)
{
config.CertificateValidator.CertificateValidation += (s, e) =>
{
if (e.Error.StatusCode == StatusCodes.BadCertificateUntrusted)
e.Accept = true;
};
_logger.LogWarning(
"OpcUaClientDriver '{DriverInstanceId}': AutoAcceptCertificates=true — all " +
"remote server certificate errors are accepted, including expired / wrong-host " +
"/ chain-incomplete. This MUST be false in production to prevent MITM attacks " +
"against the opc.tcp channel.",
_driverInstanceId);
// Accept the full set of certificate-validation error codes: a real dev cert can
// fail with BadCertificateChainIncomplete, BadCertificateTimeInvalid, or
// BadCertificateHostNameInvalid, not only BadCertificateUntrusted. Only accepting
// the latter would silently fail for those certs (Driver.OpcUaClient-012).
CertificateValidationEventHandler handler = (_, e) => e.Accept = true;
config.CertificateValidator.CertificateValidation += handler;
// Store refs so ShutdownAsync + Dispose can detach the delegate and avoid
// leaking a closure on a potentially process-shared validator.
_certValidationHandler = handler;
_certValidatorRef = config.CertificateValidator;
}
// Ensure an application certificate exists. The SDK auto-generates one if missing.
@@ -481,26 +523,67 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
try { handlerToCancel?.CancelReconnect(); } catch { }
handlerToCancel?.Dispose();
if (_keepAliveHandler is not null && Session is not null)
// Take the session reference under _probeLock before touching it, so we can't race
// an OnReconnectComplete that is simultaneously swapping to a new session
// (Driver.OpcUaClient-006). We clear Session to null here so any concurrent caller
// that checks inside _gate sees null immediately after shutdown begins.
ISession? sessionToClose;
lock (_probeLock)
{
try { Session.KeepAlive -= _keepAliveHandler; } catch { }
sessionToClose = Session;
if (_keepAliveHandler is not null && sessionToClose is not null)
{
try { sessionToClose.KeepAlive -= _keepAliveHandler; } catch { }
}
_keepAliveHandler = null;
Session = null;
}
_keepAliveHandler = null;
try { if (Session is Session s) await s.CloseAsync(cancellationToken).ConfigureAwait(false); }
try { if (sessionToClose is Session s) await s.CloseAsync(cancellationToken).ConfigureAwait(false); }
catch { /* best-effort */ }
try { Session?.Dispose(); } catch { }
Session = null;
try { sessionToClose?.Dispose(); } catch { }
_namespaceMap = null;
_connectedEndpointUrl = null;
// Detach the cert-validation handler so the (potentially process-shared)
// CertificateValidator doesn't hold a delegate to a shutting-down driver
// (Driver.OpcUaClient-012).
if (_certValidationHandler is not null && _certValidatorRef is not null)
{
try { _certValidatorRef.CertificateValidation -= _certValidationHandler; } catch { }
_certValidationHandler = null;
_certValidatorRef = null;
}
TransitionTo(HostState.Unknown);
_health = new DriverHealth(DriverState.Unknown, _health.LastSuccessfulRead, null);
}
public DriverHealth GetHealth() => _health;
public long GetMemoryFootprint() => 0;
public Task FlushOptionalCachesAsync(CancellationToken cancellationToken) => Task.CompletedTask;
/// <summary>
/// Returns an approximate in-driver memory footprint for the Core allocation-slope
/// detector. Each discovered node (folder or variable) contributes ~512 bytes to cover
/// the <see cref="DriverAttributeInfo"/> record, the browse-name string, and the stable
/// <c>nsu=</c> reference string stored in the address-space builder. The real number
/// depends on string length + box sizes; the constant is conservative enough that a
/// 10k-node remote server reports ~5 MB — well within the budget and detectable by the
/// Core slope alarm (Driver.OpcUaClient-013).
/// </summary>
public long GetMemoryFootprint() => _discoveredNodeCount * 512L;
/// <summary>
/// Drops the discovered-node count so the Core's cache-budget enforcement can request
/// a flush when footprint budget is breached. The OPC UA Client driver holds no
/// independently-flushable cache beyond what the address-space builder retains — a
/// flush here resets the footprint counter and signals the Core that re-discovery
/// will rebuild it cleanly from the remote server.
/// </summary>
public Task FlushOptionalCachesAsync(CancellationToken cancellationToken)
{
_discoveredNodeCount = 0;
return Task.CompletedTask;
}
// ---- IReadable ----
@@ -651,8 +734,20 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
results[r] = new WriteResult(codes[w].Code);
}
}
catch (OperationCanceledException)
{
// Timeout / cancellation after the wire request may have been dispatched.
// Writes are non-idempotent (decision #44/#45) — BadTimeout ("outcome unknown,
// do not blindly retry") is more honest than BadCommunicationError ("definitely
// did not happen"). Downstream callers that need retry semantics check for
// BadTimeout and can decide whether to re-issue (Driver.OpcUaClient-009).
const uint StatusBadTimeout = 0x800A0000u;
for (var w = 0; w < indexMap.Count; w++)
results[indexMap[w]] = new WriteResult(StatusBadTimeout);
}
catch (Exception)
{
// Pre-wire transport failure — the write definitely did not reach the server.
for (var w = 0; w < indexMap.Count; w++)
results[indexMap[w]] = new WriteResult(StatusBadCommunicationError);
}
@@ -729,6 +824,10 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
// still a couple of hundred ms total since the SDK chunks ReadAsync automatically.
await EnrichAndRegisterVariablesAsync(session, pendingVariables, cancellationToken)
.ConfigureAwait(false);
// Update the footprint counter so GetMemoryFootprint() returns a real estimate
// after each discovery pass (Driver.OpcUaClient-013).
_discoveredNodeCount = discovered;
}
finally { _gate.Release(); }
}
@@ -945,9 +1044,12 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
internal static DriverDataType MapUpstreamDataType(NodeId dataType)
{
if (dataType == DataTypeIds.Boolean) return DriverDataType.Boolean;
if (dataType == DataTypeIds.SByte || dataType == DataTypeIds.Byte ||
dataType == DataTypeIds.Int16) return DriverDataType.Int16;
if (dataType == DataTypeIds.UInt16) return DriverDataType.UInt16;
// SByte (signed 8-bit) shares Int16 — DriverDataType has no narrower signed type.
// Byte (unsigned 8-bit) belongs in the unsigned family → UInt16, not Int16
// (Driver.OpcUaClient-010: mapping an unsigned 0-255 type onto Int16 misrepresents
// type metadata and confuses range/validation logic keyed off DriverDataType).
if (dataType == DataTypeIds.SByte || dataType == DataTypeIds.Int16) return DriverDataType.Int16;
if (dataType == DataTypeIds.Byte || dataType == DataTypeIds.UInt16) return DriverDataType.UInt16;
if (dataType == DataTypeIds.Int32) return DriverDataType.Int32;
if (dataType == DataTypeIds.UInt32) return DriverDataType.UInt32;
if (dataType == DataTypeIds.Int64) return DriverDataType.Int64;
@@ -1216,12 +1318,48 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
{
try
{
_ = await session.CallAsync(
var resp = await session.CallAsync(
requestHeader: null,
methodsToCall: callRequests,
ct: cancellationToken).ConfigureAwait(false);
// Inspect per-ack results — the upstream server can reject individual acks
// (BadConditionAlreadyAcked, BadNodeIdUnknown, BadUserAccessDenied) even when
// the batch transport succeeds. Operators acking a critical alarm deserve to
// know if the ack didn't take (Driver.OpcUaClient-008).
if (resp?.Results is not null)
{
for (var i = 0; i < resp.Results.Count; i++)
{
var result = resp.Results[i];
if (StatusCode.IsBad(result.StatusCode))
{
_logger.LogWarning(
"OpcUaClientDriver '{DriverInstanceId}': AcknowledgeAsync ack[{Index}] " +
"rejected by upstream server with StatusCode {StatusCode:X8}. " +
"The acknowledgement may not have been applied.",
_driverInstanceId, i, result.StatusCode.Code);
}
}
}
}
catch (OperationCanceledException ex)
{
// Transport-level timeout / cancellation — propagate so the caller's
// retry / re-ack mechanism can decide what to do.
_logger.LogWarning(ex,
"OpcUaClientDriver '{DriverInstanceId}': AcknowledgeAsync transport error.",
_driverInstanceId);
throw;
}
catch (Exception ex)
{
// Log genuine transport failures rather than swallowing them silently.
_logger.LogWarning(ex,
"OpcUaClientDriver '{DriverInstanceId}': AcknowledgeAsync failed; " +
"acknowledgements may not have been applied.",
_driverInstanceId);
}
catch { /* best-effort — caller's re-ack mechanism catches pathological paths */ }
}
finally { _gate.Release(); }
}
@@ -1466,25 +1604,31 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
{
if (sender is not SessionReconnectHandler handler) return;
var newSession = handler.Session;
var oldSession = Session;
// Rewire keep-alive onto the new session — without this the next drop wouldn't
// trigger another reconnect attempt.
if (oldSession is not null && _keepAliveHandler is not null)
{
try { oldSession.KeepAlive -= _keepAliveHandler; } catch { }
}
if (newSession is not null && _keepAliveHandler is not null)
{
newSession.KeepAlive += _keepAliveHandler;
}
Session = newSession;
// Retire the handler that just finished. Done under _probeLock so this can't race
// OnKeepAlive arming a fresh handler for a subsequent drop (Driver.OpcUaClient-005).
// All mutations to Session and _reconnectHandler run under _probeLock so
// OnReconnectComplete, OnKeepAlive, and ShutdownAsync cannot race each other:
// a session swap visible to concurrent ReadAsync/WriteAsync/DiscoverAsync callers
// (which re-read Session inside _gate) must be atomic w.r.t. disposal and
// re-arming (Driver.OpcUaClient-006).
ISession? oldSession;
lock (_probeLock)
{
oldSession = Session;
// Rewire keep-alive before swapping the reference so a hot keep-alive can't
// fire against the old session after we've already assigned the new one.
if (oldSession is not null && _keepAliveHandler is not null)
{
try { oldSession.KeepAlive -= _keepAliveHandler; } catch { }
}
if (newSession is not null && _keepAliveHandler is not null)
{
newSession.KeepAlive += _keepAliveHandler;
}
Session = newSession;
// Retire the handler that just finished.
if (ReferenceEquals(_reconnectHandler, handler))
{
_reconnectHandler.Dispose();
@@ -1578,7 +1722,59 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
OnHostStatusChanged?.Invoke(this, new HostStatusChangedEventArgs(HostName, old, newState));
}
public void Dispose() => DisposeAsync().AsTask().GetAwaiter().GetResult();
/// <summary>
/// Synchronous disposal. Cancels the reconnect handler and detaches the keep-alive
/// hook synchronously (no async work on this hot path), then fires the cert-validation
/// handler detach. The async session-close is intentionally skipped — it requires a
/// live session + network round-trip and is unsafe to block-on from a potentially
/// single-threaded context (OPC UA stack thread). The session will be cleaned up by
/// the SDK's own finalizer on GC (Driver.OpcUaClient-007: no sync-over-async).
/// </summary>
public void Dispose()
{
if (_disposed) return;
_disposed = true;
// Cancel any in-flight reconnect handler.
SessionReconnectHandler? handlerToCancel;
lock (_probeLock)
{
handlerToCancel = _reconnectHandler;
_reconnectHandler = null;
// Detach keep-alive and null Session so in-flight gated callers see null
// after their next _gate.WaitAsync — they return BadCommunicationError cleanly.
if (_keepAliveHandler is not null && Session is not null)
{
try { Session.KeepAlive -= _keepAliveHandler; } catch { }
}
_keepAliveHandler = null;
Session = null;
}
try { handlerToCancel?.CancelReconnect(); } catch { }
handlerToCancel?.Dispose();
// Detach the cert-validation handler registered during InitializeAsync so the
// CertificateValidator (which may be process-shared) doesn't hold a reference to
// a disposed driver (Driver.OpcUaClient-012).
if (_certValidationHandler is not null && _certValidatorRef is not null)
{
try { _certValidatorRef.CertificateValidation -= _certValidationHandler; } catch { }
_certValidationHandler = null;
_certValidatorRef = null;
}
// Acquire the gate once so any in-flight gated operation (ReadAsync / WriteAsync /
// DiscoverAsync) has definitely released before we dispose the gate. Without this
// drain, a background read that calls _gate.Release() after Dispose throws
// ObjectDisposedException (Driver.OpcUaClient-007).
try
{
if (_gate.Wait(TimeSpan.FromSeconds(2)))
_gate.Release();
}
catch { /* timeout or already disposed — proceed */ }
_gate.Dispose();
}
public async ValueTask DisposeAsync()
{
@@ -1586,6 +1782,23 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
_disposed = true;
try { await ShutdownAsync(CancellationToken.None).ConfigureAwait(false); }
catch { /* disposal is best-effort */ }
// Detach the cert-validation handler (Driver.OpcUaClient-012).
if (_certValidationHandler is not null && _certValidatorRef is not null)
{
try { _certValidatorRef.CertificateValidation -= _certValidationHandler; } catch { }
_certValidationHandler = null;
_certValidatorRef = null;
}
// Drain the gate before disposal so no in-flight _gate.Release() fires after
// Dispose (Driver.OpcUaClient-007).
try
{
await _gate.WaitAsync(TimeSpan.FromSeconds(2)).ConfigureAwait(false);
_gate.Release();
}
catch { /* timeout or already disposed */ }
_gate.Dispose();
}
}