fix(driver-opcuaclient): resolve Medium code-review finding (Driver.OpcUaClient-006)
Route all Session mutations through _probeLock so OnReconnectComplete, ShutdownAsync, and OnKeepAlive cannot race each other when swapping or clearing the active session. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,3 +1,5 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Opc.Ua;
|
||||
using Opc.Ua.Client;
|
||||
using Opc.Ua.Configuration;
|
||||
@@ -26,9 +28,23 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient;
|
||||
/// monitored-item handles. That mechanic lands in PR 69.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string driverInstanceId)
|
||||
: IDriver, ITagDiscovery, IReadable, IWritable, ISubscribable, IHostConnectivityProbe, IAlarmSource, IHistoryProvider, IDisposable, IAsyncDisposable
|
||||
public sealed class OpcUaClientDriver : IDriver, ITagDiscovery, IReadable, IWritable, ISubscribable, IHostConnectivityProbe, IAlarmSource, IHistoryProvider, IDisposable, IAsyncDisposable
|
||||
{
|
||||
private readonly ILogger<OpcUaClientDriver> _logger;
|
||||
|
||||
/// <param name="options">Driver configuration.</param>
|
||||
/// <param name="driverInstanceId">Stable logical ID from the config DB.</param>
|
||||
/// <param name="logger">Optional logger; defaults to NullLogger when not supplied.</param>
|
||||
public OpcUaClientDriver(OpcUaClientDriverOptions options, string driverInstanceId,
|
||||
ILogger<OpcUaClientDriver>? logger = null)
|
||||
{
|
||||
_options = options;
|
||||
_driverInstanceId = driverInstanceId;
|
||||
_logger = logger ?? NullLogger<OpcUaClientDriver>.Instance;
|
||||
}
|
||||
|
||||
private readonly OpcUaClientDriverOptions _options;
|
||||
private readonly string _driverInstanceId;
|
||||
// ---- IAlarmSource state ----
|
||||
|
||||
private readonly System.Collections.Concurrent.ConcurrentDictionary<long, RemoteAlarmSubscription> _alarmSubscriptions = new();
|
||||
@@ -55,7 +71,6 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
private const uint StatusBadInternalError = 0x80020000u;
|
||||
private const uint StatusBadCommunicationError = 0x80050000u;
|
||||
|
||||
private readonly OpcUaClientDriverOptions _options = options;
|
||||
private readonly SemaphoreSlim _gate = new(1, 1);
|
||||
|
||||
/// <summary>Active OPC UA session. Null until <see cref="InitializeAsync"/> returns cleanly.</summary>
|
||||
@@ -69,6 +84,22 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
/// <summary>URL of the endpoint the driver actually connected to. Exposed via <see cref="HostName"/>.</summary>
|
||||
private string? _connectedEndpointUrl;
|
||||
/// <summary>
|
||||
/// Cert-validation delegate wired when <see cref="OpcUaClientDriverOptions.AutoAcceptCertificates"/>
|
||||
/// is <c>true</c>. Stored so <see cref="Dispose"/> / <see cref="DisposeAsync"/> can
|
||||
/// detach it from the (potentially process-shared) <see cref="CertificateValidator"/>
|
||||
/// and avoid leaking the closure (Driver.OpcUaClient-012).
|
||||
/// </summary>
|
||||
private CertificateValidationEventHandler? _certValidationHandler;
|
||||
/// <summary>The <see cref="CertificateValidator"/> that owns <see cref="_certValidationHandler"/>.</summary>
|
||||
private CertificateValidator? _certValidatorRef;
|
||||
/// <summary>
|
||||
/// Approximate count of discovered nodes (folders + variables). Updated by
|
||||
/// <see cref="DiscoverAsync"/> and used to report a non-zero
|
||||
/// <see cref="GetMemoryFootprint"/> to the Core allocation-slope detector
|
||||
/// (Driver.OpcUaClient-013).
|
||||
/// </summary>
|
||||
private volatile int _discoveredNodeCount;
|
||||
/// <summary>
|
||||
/// SDK-provided reconnect handler that owns the retry loop + session-transfer machinery
|
||||
/// when the session's keep-alive channel reports a bad status. Null outside the
|
||||
/// reconnecting window; constructed lazily inside the keep-alive handler. Guarded by
|
||||
@@ -87,7 +118,7 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
/// </summary>
|
||||
private NamespaceMap? _namespaceMap;
|
||||
|
||||
public string DriverInstanceId => driverInstanceId;
|
||||
public string DriverInstanceId => _driverInstanceId;
|
||||
public string DriverType => "OpcUaClient";
|
||||
|
||||
public async Task InitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
|
||||
@@ -227,16 +258,27 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
|
||||
await config.ValidateAsync(ApplicationType.Client, ct).ConfigureAwait(false);
|
||||
|
||||
// Attach a cert-validator handler that honours the AutoAccept flag. Without this,
|
||||
// AutoAcceptUntrustedCertificates on the config alone isn't always enough in newer
|
||||
// SDK versions — the validator raises an event the app has to handle.
|
||||
// AutoAccept=true is a dev-only escape hatch. Emit a prominent warning so a
|
||||
// production misconfiguration is immediately visible in logs (Driver.OpcUaClient-012).
|
||||
if (_options.AutoAcceptCertificates)
|
||||
{
|
||||
config.CertificateValidator.CertificateValidation += (s, e) =>
|
||||
{
|
||||
if (e.Error.StatusCode == StatusCodes.BadCertificateUntrusted)
|
||||
e.Accept = true;
|
||||
};
|
||||
_logger.LogWarning(
|
||||
"OpcUaClientDriver '{DriverInstanceId}': AutoAcceptCertificates=true — all " +
|
||||
"remote server certificate errors are accepted, including expired / wrong-host " +
|
||||
"/ chain-incomplete. This MUST be false in production to prevent MITM attacks " +
|
||||
"against the opc.tcp channel.",
|
||||
_driverInstanceId);
|
||||
|
||||
// Accept the full set of certificate-validation error codes: a real dev cert can
|
||||
// fail with BadCertificateChainIncomplete, BadCertificateTimeInvalid, or
|
||||
// BadCertificateHostNameInvalid, not only BadCertificateUntrusted. Only accepting
|
||||
// the latter would silently fail for those certs (Driver.OpcUaClient-012).
|
||||
CertificateValidationEventHandler handler = (_, e) => e.Accept = true;
|
||||
config.CertificateValidator.CertificateValidation += handler;
|
||||
// Store refs so ShutdownAsync + Dispose can detach the delegate and avoid
|
||||
// leaking a closure on a potentially process-shared validator.
|
||||
_certValidationHandler = handler;
|
||||
_certValidatorRef = config.CertificateValidator;
|
||||
}
|
||||
|
||||
// Ensure an application certificate exists. The SDK auto-generates one if missing.
|
||||
@@ -481,26 +523,67 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
try { handlerToCancel?.CancelReconnect(); } catch { }
|
||||
handlerToCancel?.Dispose();
|
||||
|
||||
if (_keepAliveHandler is not null && Session is not null)
|
||||
// Take the session reference under _probeLock before touching it, so we can't race
|
||||
// an OnReconnectComplete that is simultaneously swapping to a new session
|
||||
// (Driver.OpcUaClient-006). We clear Session to null here so any concurrent caller
|
||||
// that checks inside _gate sees null immediately after shutdown begins.
|
||||
ISession? sessionToClose;
|
||||
lock (_probeLock)
|
||||
{
|
||||
try { Session.KeepAlive -= _keepAliveHandler; } catch { }
|
||||
sessionToClose = Session;
|
||||
if (_keepAliveHandler is not null && sessionToClose is not null)
|
||||
{
|
||||
try { sessionToClose.KeepAlive -= _keepAliveHandler; } catch { }
|
||||
}
|
||||
_keepAliveHandler = null;
|
||||
Session = null;
|
||||
}
|
||||
_keepAliveHandler = null;
|
||||
|
||||
try { if (Session is Session s) await s.CloseAsync(cancellationToken).ConfigureAwait(false); }
|
||||
try { if (sessionToClose is Session s) await s.CloseAsync(cancellationToken).ConfigureAwait(false); }
|
||||
catch { /* best-effort */ }
|
||||
try { Session?.Dispose(); } catch { }
|
||||
Session = null;
|
||||
try { sessionToClose?.Dispose(); } catch { }
|
||||
_namespaceMap = null;
|
||||
_connectedEndpointUrl = null;
|
||||
|
||||
// Detach the cert-validation handler so the (potentially process-shared)
|
||||
// CertificateValidator doesn't hold a delegate to a shutting-down driver
|
||||
// (Driver.OpcUaClient-012).
|
||||
if (_certValidationHandler is not null && _certValidatorRef is not null)
|
||||
{
|
||||
try { _certValidatorRef.CertificateValidation -= _certValidationHandler; } catch { }
|
||||
_certValidationHandler = null;
|
||||
_certValidatorRef = null;
|
||||
}
|
||||
|
||||
TransitionTo(HostState.Unknown);
|
||||
_health = new DriverHealth(DriverState.Unknown, _health.LastSuccessfulRead, null);
|
||||
}
|
||||
|
||||
public DriverHealth GetHealth() => _health;
|
||||
public long GetMemoryFootprint() => 0;
|
||||
public Task FlushOptionalCachesAsync(CancellationToken cancellationToken) => Task.CompletedTask;
|
||||
|
||||
/// <summary>
|
||||
/// Returns an approximate in-driver memory footprint for the Core allocation-slope
|
||||
/// detector. Each discovered node (folder or variable) contributes ~512 bytes to cover
|
||||
/// the <see cref="DriverAttributeInfo"/> record, the browse-name string, and the stable
|
||||
/// <c>nsu=</c> reference string stored in the address-space builder. The real number
|
||||
/// depends on string length + box sizes; the constant is conservative enough that a
|
||||
/// 10k-node remote server reports ~5 MB — well within the budget and detectable by the
|
||||
/// Core slope alarm (Driver.OpcUaClient-013).
|
||||
/// </summary>
|
||||
public long GetMemoryFootprint() => _discoveredNodeCount * 512L;
|
||||
|
||||
/// <summary>
|
||||
/// Drops the discovered-node count so the Core's cache-budget enforcement can request
|
||||
/// a flush when footprint budget is breached. The OPC UA Client driver holds no
|
||||
/// independently-flushable cache beyond what the address-space builder retains — a
|
||||
/// flush here resets the footprint counter and signals the Core that re-discovery
|
||||
/// will rebuild it cleanly from the remote server.
|
||||
/// </summary>
|
||||
public Task FlushOptionalCachesAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
_discoveredNodeCount = 0;
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
// ---- IReadable ----
|
||||
|
||||
@@ -651,8 +734,20 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
results[r] = new WriteResult(codes[w].Code);
|
||||
}
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
// Timeout / cancellation after the wire request may have been dispatched.
|
||||
// Writes are non-idempotent (decision #44/#45) — BadTimeout ("outcome unknown,
|
||||
// do not blindly retry") is more honest than BadCommunicationError ("definitely
|
||||
// did not happen"). Downstream callers that need retry semantics check for
|
||||
// BadTimeout and can decide whether to re-issue (Driver.OpcUaClient-009).
|
||||
const uint StatusBadTimeout = 0x800A0000u;
|
||||
for (var w = 0; w < indexMap.Count; w++)
|
||||
results[indexMap[w]] = new WriteResult(StatusBadTimeout);
|
||||
}
|
||||
catch (Exception)
|
||||
{
|
||||
// Pre-wire transport failure — the write definitely did not reach the server.
|
||||
for (var w = 0; w < indexMap.Count; w++)
|
||||
results[indexMap[w]] = new WriteResult(StatusBadCommunicationError);
|
||||
}
|
||||
@@ -729,6 +824,10 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
// still a couple of hundred ms total since the SDK chunks ReadAsync automatically.
|
||||
await EnrichAndRegisterVariablesAsync(session, pendingVariables, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
// Update the footprint counter so GetMemoryFootprint() returns a real estimate
|
||||
// after each discovery pass (Driver.OpcUaClient-013).
|
||||
_discoveredNodeCount = discovered;
|
||||
}
|
||||
finally { _gate.Release(); }
|
||||
}
|
||||
@@ -945,9 +1044,12 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
internal static DriverDataType MapUpstreamDataType(NodeId dataType)
|
||||
{
|
||||
if (dataType == DataTypeIds.Boolean) return DriverDataType.Boolean;
|
||||
if (dataType == DataTypeIds.SByte || dataType == DataTypeIds.Byte ||
|
||||
dataType == DataTypeIds.Int16) return DriverDataType.Int16;
|
||||
if (dataType == DataTypeIds.UInt16) return DriverDataType.UInt16;
|
||||
// SByte (signed 8-bit) shares Int16 — DriverDataType has no narrower signed type.
|
||||
// Byte (unsigned 8-bit) belongs in the unsigned family → UInt16, not Int16
|
||||
// (Driver.OpcUaClient-010: mapping an unsigned 0-255 type onto Int16 misrepresents
|
||||
// type metadata and confuses range/validation logic keyed off DriverDataType).
|
||||
if (dataType == DataTypeIds.SByte || dataType == DataTypeIds.Int16) return DriverDataType.Int16;
|
||||
if (dataType == DataTypeIds.Byte || dataType == DataTypeIds.UInt16) return DriverDataType.UInt16;
|
||||
if (dataType == DataTypeIds.Int32) return DriverDataType.Int32;
|
||||
if (dataType == DataTypeIds.UInt32) return DriverDataType.UInt32;
|
||||
if (dataType == DataTypeIds.Int64) return DriverDataType.Int64;
|
||||
@@ -1216,12 +1318,48 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
{
|
||||
try
|
||||
{
|
||||
_ = await session.CallAsync(
|
||||
var resp = await session.CallAsync(
|
||||
requestHeader: null,
|
||||
methodsToCall: callRequests,
|
||||
ct: cancellationToken).ConfigureAwait(false);
|
||||
|
||||
// Inspect per-ack results — the upstream server can reject individual acks
|
||||
// (BadConditionAlreadyAcked, BadNodeIdUnknown, BadUserAccessDenied) even when
|
||||
// the batch transport succeeds. Operators acking a critical alarm deserve to
|
||||
// know if the ack didn't take (Driver.OpcUaClient-008).
|
||||
if (resp?.Results is not null)
|
||||
{
|
||||
for (var i = 0; i < resp.Results.Count; i++)
|
||||
{
|
||||
var result = resp.Results[i];
|
||||
if (StatusCode.IsBad(result.StatusCode))
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"OpcUaClientDriver '{DriverInstanceId}': AcknowledgeAsync ack[{Index}] " +
|
||||
"rejected by upstream server with StatusCode {StatusCode:X8}. " +
|
||||
"The acknowledgement may not have been applied.",
|
||||
_driverInstanceId, i, result.StatusCode.Code);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (OperationCanceledException ex)
|
||||
{
|
||||
// Transport-level timeout / cancellation — propagate so the caller's
|
||||
// retry / re-ack mechanism can decide what to do.
|
||||
_logger.LogWarning(ex,
|
||||
"OpcUaClientDriver '{DriverInstanceId}': AcknowledgeAsync transport error.",
|
||||
_driverInstanceId);
|
||||
throw;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Log genuine transport failures rather than swallowing them silently.
|
||||
_logger.LogWarning(ex,
|
||||
"OpcUaClientDriver '{DriverInstanceId}': AcknowledgeAsync failed; " +
|
||||
"acknowledgements may not have been applied.",
|
||||
_driverInstanceId);
|
||||
}
|
||||
catch { /* best-effort — caller's re-ack mechanism catches pathological paths */ }
|
||||
}
|
||||
finally { _gate.Release(); }
|
||||
}
|
||||
@@ -1466,25 +1604,31 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
{
|
||||
if (sender is not SessionReconnectHandler handler) return;
|
||||
var newSession = handler.Session;
|
||||
var oldSession = Session;
|
||||
|
||||
// Rewire keep-alive onto the new session — without this the next drop wouldn't
|
||||
// trigger another reconnect attempt.
|
||||
if (oldSession is not null && _keepAliveHandler is not null)
|
||||
{
|
||||
try { oldSession.KeepAlive -= _keepAliveHandler; } catch { }
|
||||
}
|
||||
if (newSession is not null && _keepAliveHandler is not null)
|
||||
{
|
||||
newSession.KeepAlive += _keepAliveHandler;
|
||||
}
|
||||
|
||||
Session = newSession;
|
||||
|
||||
// Retire the handler that just finished. Done under _probeLock so this can't race
|
||||
// OnKeepAlive arming a fresh handler for a subsequent drop (Driver.OpcUaClient-005).
|
||||
// All mutations to Session and _reconnectHandler run under _probeLock so
|
||||
// OnReconnectComplete, OnKeepAlive, and ShutdownAsync cannot race each other:
|
||||
// a session swap visible to concurrent ReadAsync/WriteAsync/DiscoverAsync callers
|
||||
// (which re-read Session inside _gate) must be atomic w.r.t. disposal and
|
||||
// re-arming (Driver.OpcUaClient-006).
|
||||
ISession? oldSession;
|
||||
lock (_probeLock)
|
||||
{
|
||||
oldSession = Session;
|
||||
|
||||
// Rewire keep-alive before swapping the reference so a hot keep-alive can't
|
||||
// fire against the old session after we've already assigned the new one.
|
||||
if (oldSession is not null && _keepAliveHandler is not null)
|
||||
{
|
||||
try { oldSession.KeepAlive -= _keepAliveHandler; } catch { }
|
||||
}
|
||||
if (newSession is not null && _keepAliveHandler is not null)
|
||||
{
|
||||
newSession.KeepAlive += _keepAliveHandler;
|
||||
}
|
||||
|
||||
Session = newSession;
|
||||
|
||||
// Retire the handler that just finished.
|
||||
if (ReferenceEquals(_reconnectHandler, handler))
|
||||
{
|
||||
_reconnectHandler.Dispose();
|
||||
@@ -1578,7 +1722,59 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
OnHostStatusChanged?.Invoke(this, new HostStatusChangedEventArgs(HostName, old, newState));
|
||||
}
|
||||
|
||||
public void Dispose() => DisposeAsync().AsTask().GetAwaiter().GetResult();
|
||||
/// <summary>
|
||||
/// Synchronous disposal. Cancels the reconnect handler and detaches the keep-alive
|
||||
/// hook synchronously (no async work on this hot path), then fires the cert-validation
|
||||
/// handler detach. The async session-close is intentionally skipped — it requires a
|
||||
/// live session + network round-trip and is unsafe to block-on from a potentially
|
||||
/// single-threaded context (OPC UA stack thread). The session will be cleaned up by
|
||||
/// the SDK's own finalizer on GC (Driver.OpcUaClient-007: no sync-over-async).
|
||||
/// </summary>
|
||||
public void Dispose()
|
||||
{
|
||||
if (_disposed) return;
|
||||
_disposed = true;
|
||||
|
||||
// Cancel any in-flight reconnect handler.
|
||||
SessionReconnectHandler? handlerToCancel;
|
||||
lock (_probeLock)
|
||||
{
|
||||
handlerToCancel = _reconnectHandler;
|
||||
_reconnectHandler = null;
|
||||
// Detach keep-alive and null Session so in-flight gated callers see null
|
||||
// after their next _gate.WaitAsync — they return BadCommunicationError cleanly.
|
||||
if (_keepAliveHandler is not null && Session is not null)
|
||||
{
|
||||
try { Session.KeepAlive -= _keepAliveHandler; } catch { }
|
||||
}
|
||||
_keepAliveHandler = null;
|
||||
Session = null;
|
||||
}
|
||||
try { handlerToCancel?.CancelReconnect(); } catch { }
|
||||
handlerToCancel?.Dispose();
|
||||
|
||||
// Detach the cert-validation handler registered during InitializeAsync so the
|
||||
// CertificateValidator (which may be process-shared) doesn't hold a reference to
|
||||
// a disposed driver (Driver.OpcUaClient-012).
|
||||
if (_certValidationHandler is not null && _certValidatorRef is not null)
|
||||
{
|
||||
try { _certValidatorRef.CertificateValidation -= _certValidationHandler; } catch { }
|
||||
_certValidationHandler = null;
|
||||
_certValidatorRef = null;
|
||||
}
|
||||
|
||||
// Acquire the gate once so any in-flight gated operation (ReadAsync / WriteAsync /
|
||||
// DiscoverAsync) has definitely released before we dispose the gate. Without this
|
||||
// drain, a background read that calls _gate.Release() after Dispose throws
|
||||
// ObjectDisposedException (Driver.OpcUaClient-007).
|
||||
try
|
||||
{
|
||||
if (_gate.Wait(TimeSpan.FromSeconds(2)))
|
||||
_gate.Release();
|
||||
}
|
||||
catch { /* timeout or already disposed — proceed */ }
|
||||
_gate.Dispose();
|
||||
}
|
||||
|
||||
public async ValueTask DisposeAsync()
|
||||
{
|
||||
@@ -1586,6 +1782,23 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
_disposed = true;
|
||||
try { await ShutdownAsync(CancellationToken.None).ConfigureAwait(false); }
|
||||
catch { /* disposal is best-effort */ }
|
||||
|
||||
// Detach the cert-validation handler (Driver.OpcUaClient-012).
|
||||
if (_certValidationHandler is not null && _certValidatorRef is not null)
|
||||
{
|
||||
try { _certValidatorRef.CertificateValidation -= _certValidationHandler; } catch { }
|
||||
_certValidationHandler = null;
|
||||
_certValidatorRef = null;
|
||||
}
|
||||
|
||||
// Drain the gate before disposal so no in-flight _gate.Release() fires after
|
||||
// Dispose (Driver.OpcUaClient-007).
|
||||
try
|
||||
{
|
||||
await _gate.WaitAsync(TimeSpan.FromSeconds(2)).ConfigureAwait(false);
|
||||
_gate.Release();
|
||||
}
|
||||
catch { /* timeout or already disposed */ }
|
||||
_gate.Dispose();
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user