feat(dcl): OPC UA verify-endpoint probe with untrusted-cert capture (T17)

This commit is contained in:
Joseph Doherty
2026-06-18 03:00:55 -04:00
parent 90abb4b8e2
commit 733c7bf66c
5 changed files with 551 additions and 2 deletions
@@ -1,8 +1,12 @@
using Akka.Actor;
using Akka.Event;
using Microsoft.Extensions.Logging.Abstractions;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Protocol;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.DataConnection;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Management;
using ZB.MOM.WW.ScadaBridge.Commons.Serialization;
using ZB.MOM.WW.ScadaBridge.Commons.Types.DataConnections;
using ZB.MOM.WW.ScadaBridge.DataConnectionLayer.Adapters;
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
using ZB.MOM.WW.ScadaBridge.SiteEventLogging;
@@ -20,6 +24,11 @@ public class DataConnectionManagerActor : ReceiveActor
private readonly DataConnectionOptions _options;
private readonly ISiteHealthCollector _healthCollector;
private readonly ISiteEventLogger? _siteEventLogger;
// T17: deployment-wide OPC UA application identity / cert-store paths — the same
// global options the DataConnectionFactory feeds to RealOpcUaClient when creating OPC
// UA connections. Needed by the verify-endpoint probe (VerifyEndpointCommand), which
// builds an ApplicationConfiguration directly rather than through a connection actor.
private readonly OpcUaGlobalOptions _opcUaGlobalOptions;
private readonly Dictionary<string, IActorRef> _connectionActors = new();
/// <summary>
@@ -29,16 +38,23 @@ public class DataConnectionManagerActor : ReceiveActor
/// <param name="options">Configuration options for data connections.</param>
/// <param name="healthCollector">Collector for site health metrics reported by connection actors.</param>
/// <param name="siteEventLogger">Optional logger for site event entries; null disables site event logging.</param>
/// <param name="opcUaGlobalOptions">
/// Deployment-wide OPC UA application identity / cert-store paths used by the
/// verify-endpoint probe; null falls back to defaults (mirrors
/// <see cref="DataConnectionFactory"/>'s default-options constructor).
/// </param>
public DataConnectionManagerActor(
IDataConnectionFactory factory,
DataConnectionOptions options,
ISiteHealthCollector healthCollector,
ISiteEventLogger? siteEventLogger = null)
ISiteEventLogger? siteEventLogger = null,
OpcUaGlobalOptions? opcUaGlobalOptions = null)
{
_factory = factory;
_options = options;
_healthCollector = healthCollector;
_siteEventLogger = siteEventLogger;
_opcUaGlobalOptions = opcUaGlobalOptions ?? new OpcUaGlobalOptions();
Receive<CreateConnectionCommand>(HandleCreateConnection);
Receive<SubscribeTagsRequest>(HandleRoute);
@@ -52,6 +68,7 @@ public class DataConnectionManagerActor : ReceiveActor
Receive<BrowseNodeCommand>(HandleBrowse);
Receive<SearchAddressSpaceCommand>(HandleSearch);
Receive<ReadTagValuesCommand>(HandleReadTagValues);
Receive<VerifyEndpointCommand>(HandleVerifyEndpoint);
}
private void HandleCreateConnection(CreateConnectionCommand command)
@@ -243,6 +260,46 @@ public class DataConnectionManagerActor : ReceiveActor
}
}
/// <summary>
/// T17: Handles a <see cref="VerifyEndpointCommand"/> from the Central UI's "Verify"
/// action — probes the endpoint config WITHOUT persisting it (connect → capture an
/// untrusted cert → disconnect) and pipes a structured <see cref="VerifyEndpointResult"/>
/// back to the sender. Verify does NOT require an existing connection (the config may be
/// brand-new and unsaved), so — unlike the routed browse/read handlers — it does not look
/// up a connection actor; it runs the probe directly. Only OPC UA is supported today.
/// The sender always receives a <see cref="VerifyEndpointResult"/>: an unsupported protocol,
/// a config that fails to parse, and a faulted probe task are all reported as
/// <see cref="VerifyFailureKind.ServerError"/>.
/// </summary>
/// <param name="cmd">The verify request carrying the protocol name, connection name and raw config JSON.</param>
private void HandleVerifyEndpoint(VerifyEndpointCommand cmd)
{
    // Resolve the reply target once, on the actor thread, and use it for every reply path.
    var replyTo = Sender;

    if (!string.Equals(cmd.Protocol, "OpcUa", StringComparison.OrdinalIgnoreCase))
    {
        replyTo.Tell(new VerifyEndpointResult(
            false, VerifyFailureKind.ServerError,
            "Verify is only supported for OPC UA connections.", null));
        return;
    }

    OpcUaEndpointConfig config;
    try
    {
        // NOTE(review): the second tuple element is discarded. If it carries a parse
        // status (e.g. a "malformed" classification), a malformed config would be probed
        // with whatever config Deserialize returned — confirm this is intended.
        (config, _) = OpcUaEndpointConfigSerializer.Deserialize(cmd.ConfigJson);
    }
    catch (Exception ex)
    {
        // Defensive: Deserialize is designed not to throw (it classifies Malformed), but
        // a verify must never crash the manager — surface the parse failure as ServerError.
        _log.Warning(ex, "Verify config for {0} could not be parsed", cmd.ConnectionName);
        replyTo.Tell(new VerifyEndpointResult(
            false, VerifyFailureKind.ServerError,
            "The endpoint configuration could not be parsed.", null));
        return;
    }

    // The probe catches its own connect failures, but code that runs before its try block
    // can still fault the task. Without a failure mapper PipeTo would deliver an Akka
    // Status.Failure, which the asker is not expecting; map it to a structured result.
    RealOpcUaClient
        .VerifyEndpointAsync(config, _opcUaGlobalOptions, NullLogger.Instance, TimeSpan.FromSeconds(6), CancellationToken.None)
        .PipeTo(
            replyTo,
            failure: ex => new VerifyEndpointResult(
                false, VerifyFailureKind.ServerError, ex.GetBaseException().Message, null));
}
private void HandleRemoveConnection(RemoveConnectionCommand command)
{
if (_connectionActors.TryGetValue(command.ConnectionName, out var actor))
@@ -5,7 +5,9 @@ using Microsoft.Extensions.Logging.Abstractions;
using Opc.Ua;
using Opc.Ua.Client;
using Opc.Ua.Configuration;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Management;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Alarms;
using ZB.MOM.WW.ScadaBridge.Commons.Types.DataConnections;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.DataConnectionLayer.Adapters;
@@ -155,6 +157,230 @@ public class RealOpcUaClient : IOpcUaClient
await _subscription.CreateAsync(cancellationToken);
}
/// <summary>
/// T17: Probes an OPC UA endpoint configuration WITHOUT persisting it or creating a
/// long-lived connection — connect, capture the server certificate if it is untrusted,
/// then disconnect. The probe is secure-by-default and READ-ONLY: it forces
/// <c>AutoAcceptUntrustedCertificates = false</c> and a validation hook that captures an
/// untrusted server certificate then REJECTS it (<c>e.Accept = false</c>). It never trusts
/// the certificate — trusting is a separate, later operator action. The session is always
/// closed and disposed in a <c>finally</c>.
/// </summary>
/// <remarks>
/// Endpoint discovery uses the SDK's blocking API, so it is run on the thread pool and the
/// wait for it is bounded by the same linked token as session creation. This keeps the
/// caller's thread free and makes <paramref name="timeout"/> cover discovery as documented.
/// </remarks>
/// <param name="config">The endpoint configuration to probe.</param>
/// <param name="globalOptions">Deployment-wide OPC UA application identity / cert-store paths.</param>
/// <param name="logger">Logger for diagnostics.</param>
/// <param name="timeout">Wall-clock budget for the probe (discovery + session create).</param>
/// <param name="ct">External cancellation token, linked with the timeout.</param>
/// <returns>A structured <see cref="VerifyEndpointResult"/> classifying the outcome.</returns>
public static async Task<VerifyEndpointResult> VerifyEndpointAsync(
    OpcUaEndpointConfig config,
    OpcUaGlobalOptions globalOptions,
    ILogger logger,
    TimeSpan timeout,
    CancellationToken ct)
{
    // Captured by the certificate-validation hook below. A non-null value here means
    // the server presented an untrusted certificate; it dominates the outcome mapping.
    X509Certificate2? capturedCert = null;
    // True only when capturedCert is our own copy (and therefore ours to dispose).
    var ownsCapturedCert = false;
    ISession? session = null;
    Exception? failure = null;

    var endpointUrl = string.IsNullOrWhiteSpace(config.EndpointUrl)
        ? "opc.tcp://localhost:4840"
        : config.EndpointUrl;

    var preferredSecurityMode = config.SecurityMode switch
    {
        OpcUaSecurityMode.Sign => MessageSecurityMode.Sign,
        OpcUaSecurityMode.SignAndEncrypt => MessageSecurityMode.SignAndEncrypt,
        _ => MessageSecurityMode.None
    };

    // T17: secure-by-default — force AutoAccept=false so an untrusted server cert is
    // captured and rejected rather than silently accepted (defeating the whole probe).
    var appConfig = new ApplicationConfiguration
    {
        ApplicationName = string.IsNullOrWhiteSpace(globalOptions.ApplicationName)
            ? "ScadaBridge-DCL"
            : globalOptions.ApplicationName,
        ApplicationType = ApplicationType.Client,
        SecurityConfiguration = new SecurityConfiguration
        {
            AutoAcceptUntrustedCertificates = false,
            ApplicationCertificate = new CertificateIdentifier(),
            TrustedIssuerCertificates = new CertificateTrustList { StorePath = ResolveStorePath(globalOptions.TrustedIssuerStorePath, "issuers") },
            TrustedPeerCertificates = new CertificateTrustList { StorePath = ResolveStorePath(globalOptions.TrustedPeerStorePath, "trusted") },
            RejectedCertificateStore = new CertificateTrustList { StorePath = ResolveStorePath(globalOptions.RejectedCertificateStorePath, "rejected") }
        },
        ClientConfiguration = new ClientConfiguration { DefaultSessionTimeout = config.SessionTimeoutMs },
        TransportQuotas = new TransportQuotas { OperationTimeout = config.OperationTimeoutMs }
    };

    // T17: capture the untrusted server cert, then REJECT it (e.Accept = false). The
    // validator runs on the SDK's connect thread; the captured cert is the only state we
    // keep. Never accept — this probe must not trust anything.
    appConfig.CertificateValidator.CertificateValidation += (_, e) =>
    {
        // If the hook fires more than once, release the copy made by the earlier call.
        var previousCopy = ownsCapturedCert ? capturedCert : null;
        try
        {
            // Copy into a stable instance so disposing the SDK's chain doesn't invalidate it.
            capturedCert = X509CertificateLoader.LoadCertificate(e.Certificate.RawData);
            ownsCapturedCert = true;
        }
        catch
        {
            // Best-effort capture: fall back to the original reference if the copy fails.
            // That instance belongs to the SDK, so it must not be disposed here.
            capturedCert = e.Certificate;
            ownsCapturedCert = false;
        }

        previousCopy?.Dispose();
        e.Accept = false;
    };

    using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
    linkedCts.CancelAfter(timeout);

    try
    {
        await appConfig.ValidateAsync(ApplicationType.Client);

        // Discover endpoints, pick the preferred security mode (same logic as ConnectAsync).
        // The discovery call is synchronous and takes no token, so it runs on the thread
        // pool and only the WAIT is cancellable. Every discovery failure is handled inside
        // the delegate (fallback to the raw URL), so an abandoned task can never fault
        // unobserved; it simply finishes and disposes its client in the background.
        var discovery = Task.Run<EndpointDescription?>(() =>
        {
            try
            {
#pragma warning disable CS0618
                using var discoveryClient = DiscoveryClient.Create(new Uri(endpointUrl));
                var endpoints = discoveryClient.GetEndpoints(null);
#pragma warning restore CS0618
                return endpoints.FirstOrDefault(ep => ep.SecurityMode == preferredSecurityMode)
                    ?? endpoints.FirstOrDefault();
            }
            catch
            {
                return new EndpointDescription(endpointUrl);
            }
        });

        // Throws OperationCanceledException when the budget elapses or ct is cancelled;
        // the outer catch records it and MapVerifyOutcome classifies it as Timeout.
        var endpoint = await discovery.WaitAsync(linkedCts.Token);

        var endpointConfig = EndpointConfiguration.Create(appConfig);
        var configuredEndpoint = new ConfiguredEndpoint(null, endpoint, endpointConfig);

#pragma warning disable CS0618 // Allow obsolete DefaultSessionFactory constructor for compatibility
        var sessionFactory = new DefaultSessionFactory();
#pragma warning restore CS0618

        var userIdentity = BuildUserIdentity(config.UserIdentity is { } ui
            ? new OpcUaUserIdentityOptions(
                ui.TokenType.ToString(), ui.Username, ui.Password,
                ui.CertificatePath, ui.CertificatePassword)
            : null);

        session = await sessionFactory.CreateAsync(
            appConfig, configuredEndpoint, false,
            "ScadaBridge-DCL-Verify", (uint)config.SessionTimeoutMs,
            userIdentity, null, linkedCts.Token);
    }
    catch (Exception ex)
    {
        // OperationCanceledException from the linked CTS firing on timeout is mapped to
        // VerifyFailureKind.Timeout inside MapVerifyOutcome.
        failure = ex;
        logger.LogDebug(ex, "OPC UA verify of {Endpoint} failed.", endpointUrl);
    }
    finally
    {
        // T17: ALWAYS dispose the probe session — never leave a connection open.
        if (session != null)
        {
            try { await session.CloseAsync(CancellationToken.None); }
            catch (Exception ex) { logger.LogDebug(ex, "OPC UA verify session close failed (ignored)."); }
            session.Dispose();
        }
    }

    try
    {
        // MapVerifyOutcome copies what it needs out of the certificate (strings, dates,
        // base64), so the private copy can be released as soon as the result is built.
        return MapVerifyOutcome(failure, capturedCert);
    }
    finally
    {
        if (ownsCapturedCert)
        {
            capturedCert?.Dispose();
        }
    }
}
/// <summary>
/// T17: Translates the raw result of a verify probe into a <see cref="VerifyEndpointResult"/>.
/// Kept free of any network access so the classification rules can be unit-tested without
/// a live OPC UA server.
/// Rules, in order: a captured certificate always produces
/// <see cref="VerifyFailureKind.UntrustedCertificate"/>; no certificate and no exception is
/// success; otherwise the exception decides the failure kind and supplies the message.
/// </summary>
/// <param name="failure">The exception thrown during the probe, or null on success.</param>
/// <param name="capturedCert">The untrusted server certificate captured by the validation hook, or null.</param>
/// <returns>The classified verification result.</returns>
internal static VerifyEndpointResult MapVerifyOutcome(Exception? failure, X509Certificate2? capturedCert)
{
    // Rule 1: an untrusted certificate outranks every other signal, because it is the one
    // outcome the operator can act on (by choosing to trust it later).
    if (capturedCert is not null)
    {
        return new VerifyEndpointResult(
            false,
            VerifyFailureKind.UntrustedCertificate,
            "The server certificate is not trusted by this site.",
            new ServerCertInfo(
                capturedCert.Thumbprint,
                capturedCert.Subject,
                capturedCert.Issuer,
                capturedCert.NotBefore.ToUniversalTime(),
                capturedCert.NotAfter.ToUniversalTime(),
                Convert.ToBase64String(capturedCert.RawData)));
    }

    // Rule 2: nothing captured and nothing thrown — the session was created.
    if (failure is null)
    {
        return new VerifyEndpointResult(true, null, null, null);
    }

    // Rule 3: classify the exception. Every failure result carries the exception's own message.
    VerifyFailureKind kind;
    if (failure is TimeoutException or OperationCanceledException)
    {
        // The linked CTS fired, or a timeout was raised directly.
        kind = VerifyFailureKind.Timeout;
    }
    else if (failure is ServiceResultException sdkFault)
    {
        kind = ClassifySdkFault(sdkFault);
    }
    else
    {
        // A bare socket failure that reached us before the SDK wrapped it.
        kind = HasSocketCause(failure) ? VerifyFailureKind.Unreachable : VerifyFailureKind.ServerError;
    }

    return new VerifyEndpointResult(false, kind, failure.Message, null);

    // Maps an SDK exception to a failure kind: a socket cause in the inner chain wins,
    // otherwise the OPC UA status code decides.
    static VerifyFailureKind ClassifySdkFault(ServiceResultException sdkFault)
    {
        if (HasSocketCause(sdkFault))
        {
            return VerifyFailureKind.Unreachable;
        }

        switch (sdkFault.StatusCode)
        {
            case StatusCodes.BadRequestTimeout:
            case StatusCodes.BadTimeout:
                return VerifyFailureKind.Timeout;

            case StatusCodes.BadUserAccessDenied:
            case StatusCodes.BadIdentityTokenRejected:
            case StatusCodes.BadIdentityTokenInvalid:
                return VerifyFailureKind.AuthFailed;

            case StatusCodes.BadConnectionRejected:
            case StatusCodes.BadNotConnected:
            case StatusCodes.BadConnectionClosed:
            case StatusCodes.BadNoCommunication:
            case StatusCodes.BadServerNotConnected:
                return VerifyFailureKind.Unreachable;

            default:
                return VerifyFailureKind.ServerError;
        }
    }
}
/// <summary>
/// Reports whether <paramref name="ex"/> or any exception reachable through its
/// <c>InnerException</c> links is a <see cref="System.Net.Sockets.SocketException"/>.
/// Callers treat a match as "the endpoint host is unreachable" (for example a
/// DNS-resolution or connection-refused failure).
/// </summary>
/// <param name="ex">The outermost exception to inspect.</param>
/// <returns><c>true</c> if a socket exception is found anywhere in the chain; otherwise <c>false</c>.</returns>
private static bool HasSocketCause(Exception ex)
{
    var current = ex;
    while (current is not null)
    {
        if (current is System.Net.Sockets.SocketException)
        {
            return true;
        }

        // Step one level deeper; a null InnerException ends the walk.
        current = current.InnerException;
    }

    return false;
}
/// <inheritdoc />
public async Task DisconnectAsync(CancellationToken cancellationToken = default)
{