feat(dcl): OPC UA verify-endpoint probe with untrusted-cert capture (T17)

This commit is contained in:
Joseph Doherty
2026-06-18 03:00:55 -04:00
parent 90abb4b8e2
commit 733c7bf66c
5 changed files with 551 additions and 2 deletions
@@ -0,0 +1,80 @@
namespace ZB.MOM.WW.ScadaBridge.Commons.Messages.Management;
/// <summary>
/// Request sent from the Central UI to a site's DCL manager asking it to probe a
/// data-connection endpoint configuration that has not been persisted. The probe
/// connects, captures the server certificate when it is untrusted, and disconnects.
/// It is read-only and never trusts the server certificate: an untrusted certificate
/// is captured and the connect is rejected, because trusting a certificate is a
/// separate, later action. OPC UA is the only protocol supported today; any other
/// protocol is answered with a <see cref="VerifyEndpointResult"/> whose failure kind is
/// <see cref="VerifyFailureKind.ServerError"/>.
/// </summary>
/// <param name="ConnectionName">Name of the data connection under verification; used for logging/correlation.</param>
/// <param name="Protocol">Protocol type string such as "OpcUa"; compared case-insensitively.</param>
/// <param name="ConfigJson">The endpoint configuration serialized as JSON (the typed OPC UA endpoint shape).</param>
public record VerifyEndpointCommand(
    string ConnectionName,
    string Protocol,
    string ConfigJson);
/// <summary>
/// Why an endpoint verification failed. The kinds exist so the Central UI can present
/// each case differently; the most important one is <see cref="UntrustedCertificate"/>,
/// which comes with a captured server certificate the operator may decide to trust in
/// a later step.
/// </summary>
public enum VerifyFailureKind
{
    /// <summary>The endpoint host was not reachable (DNS failure, connection refused, socket error).</summary>
    Unreachable = 0,

    /// <summary>The supplied user identity (anonymous/username/certificate) was rejected by the server.</summary>
    AuthFailed = 1,

    /// <summary>
    /// The server's certificate is not trusted by this site. The probe itself rejected
    /// it, and the certificate is returned in <see cref="VerifyEndpointResult.Cert"/> so
    /// it can be reviewed and trusted in a later step.
    /// </summary>
    UntrustedCertificate = 2,

    /// <summary>The verification did not finish inside its time budget.</summary>
    Timeout = 3,

    /// <summary>Any other server-side or unexpected failure, including an unsupported protocol.</summary>
    ServerError = 4
}
/// <summary>
/// A server certificate captured while probing an endpoint. It holds the fields the
/// Central UI shows to an operator making a trust decision, together with the raw DER
/// bytes (base64-encoded) so that the exact certificate can be written to the trusted
/// store in a later step.
/// </summary>
/// <param name="Thumbprint">SHA-1 thumbprint of the certificate, as hex.</param>
/// <param name="Subject">Subject distinguished name of the certificate.</param>
/// <param name="Issuer">Issuer distinguished name of the certificate.</param>
/// <param name="NotBeforeUtc">Start of the validity period (UTC).</param>
/// <param name="NotAfterUtc">End of the validity period (UTC).</param>
/// <param name="DerBase64">The DER-encoded certificate bytes, base64-encoded.</param>
public record ServerCertInfo(
    string Thumbprint,
    string Subject,
    string Issuer,
    DateTime NotBeforeUtc,
    DateTime NotAfterUtc,
    string DerBase64);
/// <summary>
/// Reply to a <see cref="VerifyEndpointCommand"/>. When the probe succeeds,
/// <see cref="FailureKind"/>, <see cref="Error"/> and <see cref="Cert"/> are all null.
/// When it fails, <see cref="FailureKind"/> says what kind of failure occurred and
/// <see cref="Error"/> holds a human-readable message. <see cref="Cert"/> is set only
/// for <see cref="VerifyFailureKind.UntrustedCertificate"/>.
/// </summary>
/// <param name="Success">True when a session was established, i.e. the endpoint config is valid and reachable.</param>
/// <param name="FailureKind">Classification of the failure; null on success.</param>
/// <param name="Error">Human-readable error message; null on success.</param>
/// <param name="Cert">The untrusted server certificate that was captured, or null.</param>
public record VerifyEndpointResult(
    bool Success,
    VerifyFailureKind? FailureKind,
    string? Error,
    ServerCertInfo? Cert);
@@ -1,8 +1,12 @@
using Akka.Actor;
using Akka.Event;
using Microsoft.Extensions.Logging.Abstractions;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Protocol;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.DataConnection;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Management;
using ZB.MOM.WW.ScadaBridge.Commons.Serialization;
using ZB.MOM.WW.ScadaBridge.Commons.Types.DataConnections;
using ZB.MOM.WW.ScadaBridge.DataConnectionLayer.Adapters;
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
using ZB.MOM.WW.ScadaBridge.SiteEventLogging;
@@ -20,6 +24,11 @@ public class DataConnectionManagerActor : ReceiveActor
private readonly DataConnectionOptions _options;
private readonly ISiteHealthCollector _healthCollector;
private readonly ISiteEventLogger? _siteEventLogger;
// T17: deployment-wide OPC UA application identity / cert-store paths — the same
// global options the DataConnectionFactory feeds to RealOpcUaClient when creating OPC
// UA connections. Needed by the verify-endpoint probe (VerifyEndpointCommand), which
// builds an ApplicationConfiguration directly rather than through a connection actor.
private readonly OpcUaGlobalOptions _opcUaGlobalOptions;
private readonly Dictionary<string, IActorRef> _connectionActors = new();
/// <summary>
@@ -29,16 +38,23 @@ public class DataConnectionManagerActor : ReceiveActor
/// <param name="options">Configuration options for data connections.</param>
/// <param name="healthCollector">Collector for site health metrics reported by connection actors.</param>
/// <param name="siteEventLogger">Optional logger for site event entries; null disables site event logging.</param>
/// <param name="opcUaGlobalOptions">
/// Deployment-wide OPC UA application identity / cert-store paths used by the
/// verify-endpoint probe; null falls back to defaults (mirrors
/// <see cref="DataConnectionFactory"/>'s default-options constructor).
/// </param>
public DataConnectionManagerActor(
IDataConnectionFactory factory,
DataConnectionOptions options,
ISiteHealthCollector healthCollector,
ISiteEventLogger? siteEventLogger = null)
ISiteEventLogger? siteEventLogger = null,
OpcUaGlobalOptions? opcUaGlobalOptions = null)
{
_factory = factory;
_options = options;
_healthCollector = healthCollector;
_siteEventLogger = siteEventLogger;
_opcUaGlobalOptions = opcUaGlobalOptions ?? new OpcUaGlobalOptions();
Receive<CreateConnectionCommand>(HandleCreateConnection);
Receive<SubscribeTagsRequest>(HandleRoute);
@@ -52,6 +68,7 @@ public class DataConnectionManagerActor : ReceiveActor
Receive<BrowseNodeCommand>(HandleBrowse);
Receive<SearchAddressSpaceCommand>(HandleSearch);
Receive<ReadTagValuesCommand>(HandleReadTagValues);
Receive<VerifyEndpointCommand>(HandleVerifyEndpoint);
}
private void HandleCreateConnection(CreateConnectionCommand command)
@@ -243,6 +260,46 @@ public class DataConnectionManagerActor : ReceiveActor
}
}
/// <summary>
/// T17: Handles a <see cref="VerifyEndpointCommand"/> from the Central UI's "Verify"
/// action — probes the endpoint config WITHOUT persisting it (connect → capture an
/// untrusted cert → disconnect) and pipes a structured <see cref="VerifyEndpointResult"/>
/// back to the sender. Verify does NOT require an existing connection (the config may be
/// brand-new and unsaved), so — unlike the routed browse/read handlers — it does not look
/// up a connection actor; it runs the probe directly. Only OPC UA is supported today.
/// <para>
/// The requester always receives a <see cref="VerifyEndpointResult"/>: a probe task that
/// faults or is cancelled is mapped to <see cref="VerifyFailureKind.ServerError"/> instead
/// of being delivered as a raw failure status. The probe is started on the thread pool so
/// that none of its synchronous work runs on this actor's thread.
/// </para>
/// </summary>
/// <param name="cmd">The verify request carrying the protocol name and the serialized endpoint config.</param>
private void HandleVerifyEndpoint(VerifyEndpointCommand cmd)
{
    // Resolve the reply target once, while still inside the actor's message context.
    var replyTo = Sender;

    if (!string.Equals(cmd.Protocol, "OpcUa", StringComparison.OrdinalIgnoreCase))
    {
        replyTo.Tell(new VerifyEndpointResult(
            false, VerifyFailureKind.ServerError,
            "Verify is only supported for OPC UA connections.", null));
        return;
    }

    OpcUaEndpointConfig config;
    try
    {
        (config, _) = OpcUaEndpointConfigSerializer.Deserialize(cmd.ConfigJson);
    }
    catch (Exception ex)
    {
        // Defensive: Deserialize is designed not to throw (it classifies Malformed), but
        // a verify must never crash the manager — surface the parse failure as ServerError.
        _log.Warning(ex, "Verify config for {0} could not be parsed", cmd.ConnectionName);
        replyTo.Tell(new VerifyEndpointResult(
            false, VerifyFailureKind.ServerError,
            "The endpoint configuration could not be parsed.", null));
        return;
    }

    // Wall-clock budget handed to the probe.
    var probeTimeout = TimeSpan.FromSeconds(6);
    var probeLogger = NullLogger.Instance;
    var globalOptions = _opcUaGlobalOptions;

    // Task.Run keeps the probe's synchronous prefix (configuration build-up, any blocking
    // SDK call before its first incomplete await) off the manager actor's thread, so other
    // messages keep flowing while the probe runs.
    Task.Run(() => RealOpcUaClient.VerifyEndpointAsync(
            config, globalOptions, probeLogger, probeTimeout, CancellationToken.None))
        .PipeTo(
            replyTo,
            // Without this mapper a faulted task would reach the requester as a
            // Status.Failure rather than the VerifyEndpointResult it expects.
            failure: ex => new VerifyEndpointResult(
                false, VerifyFailureKind.ServerError, ex.Message, null));
}
private void HandleRemoveConnection(RemoveConnectionCommand command)
{
if (_connectionActors.TryGetValue(command.ConnectionName, out var actor))
@@ -5,7 +5,9 @@ using Microsoft.Extensions.Logging.Abstractions;
using Opc.Ua;
using Opc.Ua.Client;
using Opc.Ua.Configuration;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Management;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Alarms;
using ZB.MOM.WW.ScadaBridge.Commons.Types.DataConnections;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.DataConnectionLayer.Adapters;
@@ -155,6 +157,230 @@ public class RealOpcUaClient : IOpcUaClient
await _subscription.CreateAsync(cancellationToken);
}
/// <summary>
/// T17: Probes an OPC UA endpoint configuration WITHOUT persisting it or creating a
/// long-lived connection — connect, capture the server certificate if it is untrusted,
/// then disconnect. The probe is secure-by-default and READ-ONLY: it forces
/// <c>AutoAcceptUntrustedCertificates = false</c> and a validation hook that captures an
/// untrusted server certificate then REJECTS it (<c>e.Accept = false</c>). It never trusts
/// the certificate — trusting is a separate, later operator action. The session is always
/// disposed in a <c>finally</c>.
/// <para>
/// Endpoint discovery is awaited with the same linked cancellation token as session
/// creation, so the <paramref name="timeout"/> budget applies to both phases. A discovery
/// failure that is NOT caused by cancellation falls back to a bare endpoint description
/// for the configured URL.
/// </para>
/// </summary>
/// <param name="config">The endpoint configuration to probe.</param>
/// <param name="globalOptions">Deployment-wide OPC UA application identity / cert-store paths.</param>
/// <param name="logger">Logger for diagnostics.</param>
/// <param name="timeout">Wall-clock budget for the whole probe (discovery + session create).</param>
/// <param name="ct">External cancellation token, linked with the timeout.</param>
/// <returns>A structured <see cref="VerifyEndpointResult"/> classifying the outcome.</returns>
public static async Task<VerifyEndpointResult> VerifyEndpointAsync(
    OpcUaEndpointConfig config,
    OpcUaGlobalOptions globalOptions,
    ILogger logger,
    TimeSpan timeout,
    CancellationToken ct)
{
    // Captured by the certificate-validation hook below. A non-null value here means
    // the server presented an untrusted certificate; it dominates the outcome mapping.
    X509Certificate2? capturedCert = null;
    ISession? session = null;
    Exception? failure = null;

    var endpointUrl = string.IsNullOrWhiteSpace(config.EndpointUrl)
        ? "opc.tcp://localhost:4840"
        : config.EndpointUrl;

    var preferredSecurityMode = config.SecurityMode switch
    {
        OpcUaSecurityMode.Sign => MessageSecurityMode.Sign,
        OpcUaSecurityMode.SignAndEncrypt => MessageSecurityMode.SignAndEncrypt,
        _ => MessageSecurityMode.None
    };

    // T17: secure-by-default — force AutoAccept=false so an untrusted server cert is
    // captured and rejected rather than silently accepted (defeating the whole probe).
    var appConfig = new ApplicationConfiguration
    {
        ApplicationName = string.IsNullOrWhiteSpace(globalOptions.ApplicationName)
            ? "ScadaBridge-DCL"
            : globalOptions.ApplicationName,
        ApplicationType = ApplicationType.Client,
        SecurityConfiguration = new SecurityConfiguration
        {
            AutoAcceptUntrustedCertificates = false,
            ApplicationCertificate = new CertificateIdentifier(),
            TrustedIssuerCertificates = new CertificateTrustList { StorePath = ResolveStorePath(globalOptions.TrustedIssuerStorePath, "issuers") },
            TrustedPeerCertificates = new CertificateTrustList { StorePath = ResolveStorePath(globalOptions.TrustedPeerStorePath, "trusted") },
            RejectedCertificateStore = new CertificateTrustList { StorePath = ResolveStorePath(globalOptions.RejectedCertificateStorePath, "rejected") }
        },
        ClientConfiguration = new ClientConfiguration { DefaultSessionTimeout = config.SessionTimeoutMs },
        TransportQuotas = new TransportQuotas { OperationTimeout = config.OperationTimeoutMs }
    };

    // T17: capture the untrusted server cert, then REJECT it (e.Accept = false). The
    // validator runs on the SDK's connect thread; copying the cert is the only state we
    // keep. Never accept — this probe must not trust anything.
    appConfig.CertificateValidator.CertificateValidation += (_, e) =>
    {
        try
        {
            // Copy into a stable instance so disposing the SDK's chain doesn't invalidate it.
            capturedCert = X509CertificateLoader.LoadCertificate(e.Certificate.RawData);
        }
        catch
        {
            // Best-effort capture: fall back to the original reference if the copy fails.
            capturedCert = e.Certificate;
        }

        e.Accept = false;
    };

    using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
    linkedCts.CancelAfter(timeout);

    try
    {
        await appConfig.ValidateAsync(ApplicationType.Client);

        // Discover endpoints, pick the preferred security mode (same logic as ConnectAsync).
        EndpointDescription? endpoint;
        try
        {
#pragma warning disable CS0618
            using var discoveryClient = DiscoveryClient.Create(new Uri(endpointUrl));
#pragma warning restore CS0618
            // Awaited with the linked token so discovery is bounded by the probe budget
            // instead of blocking the calling thread for the transport's own timeout.
            var endpoints = await discoveryClient.GetEndpointsAsync(null, linkedCts.Token);
            endpoint = endpoints.FirstOrDefault(ep => ep.SecurityMode == preferredSecurityMode)
                ?? endpoints.FirstOrDefault();
        }
        catch (Exception)
        {
            // If the budget is already spent (or the caller cancelled), stop here: the
            // resulting OperationCanceledException is classified as a timeout below.
            linkedCts.Token.ThrowIfCancellationRequested();

            // Any other discovery failure: fall back to a bare description of the
            // configured URL and let session creation report the real problem.
            endpoint = new EndpointDescription(endpointUrl);
        }

        var endpointConfig = EndpointConfiguration.Create(appConfig);
        var configuredEndpoint = new ConfiguredEndpoint(null, endpoint, endpointConfig);

#pragma warning disable CS0618 // Allow obsolete DefaultSessionFactory constructor for compatibility
        var sessionFactory = new DefaultSessionFactory();
#pragma warning restore CS0618

        var userIdentity = BuildUserIdentity(config.UserIdentity is { } ui
            ? new OpcUaUserIdentityOptions(
                ui.TokenType.ToString(), ui.Username, ui.Password,
                ui.CertificatePath, ui.CertificatePassword)
            : null);

        session = await sessionFactory.CreateAsync(
            appConfig, configuredEndpoint, false,
            "ScadaBridge-DCL-Verify", (uint)config.SessionTimeoutMs,
            userIdentity, null, linkedCts.Token);
    }
    catch (Exception ex)
    {
        // OperationCanceledException from the linked CTS firing on timeout is mapped to
        // VerifyFailureKind.Timeout inside MapVerifyOutcome.
        failure = ex;
        logger.LogDebug(ex, "OPC UA verify of {Endpoint} failed.", endpointUrl);
    }
    finally
    {
        // T17: ALWAYS dispose the probe session — never leave a connection open.
        if (session != null)
        {
            try { await session.CloseAsync(CancellationToken.None); }
            catch (Exception ex) { logger.LogDebug(ex, "OPC UA verify session close failed (ignored)."); }
            session.Dispose();
        }
    }

    return MapVerifyOutcome(failure, capturedCert);
}
/// <summary>
/// T17: Translates what a probe observed — an exception (if any) and an untrusted
/// server certificate (if one was captured) — into a <see cref="VerifyEndpointResult"/>.
/// It is a pure function, kept separate so the classification can be unit-tested with
/// no live OPC UA server. Order of precedence: a captured certificate always produces
/// <see cref="VerifyFailureKind.UntrustedCertificate"/>; with no certificate and no
/// exception the session was created (success); otherwise the exception is classified.
/// </summary>
/// <param name="failure">The exception thrown during the probe, or null on success.</param>
/// <param name="capturedCert">The untrusted server certificate captured by the validation hook, or null.</param>
/// <returns>The classified verification result.</returns>
internal static VerifyEndpointResult MapVerifyOutcome(Exception? failure, X509Certificate2? capturedCert)
{
    // The untrusted-certificate case wins over any exception: it is the one the operator
    // can act on (by choosing to trust the certificate later).
    if (capturedCert is not null)
    {
        return new VerifyEndpointResult(
            Success: false,
            FailureKind: VerifyFailureKind.UntrustedCertificate,
            Error: "The server certificate is not trusted by this site.",
            Cert: new ServerCertInfo(
                Thumbprint: capturedCert.Thumbprint,
                Subject: capturedCert.Subject,
                Issuer: capturedCert.Issuer,
                NotBeforeUtc: capturedCert.NotBefore.ToUniversalTime(),
                NotAfterUtc: capturedCert.NotAfter.ToUniversalTime(),
                DerBase64: Convert.ToBase64String(capturedCert.RawData)));
    }

    if (failure is null)
    {
        return new VerifyEndpointResult(true, null, null, null);
    }

    var kind = failure switch
    {
        // The linked CTS fired, or a timeout was raised directly.
        TimeoutException or OperationCanceledException => VerifyFailureKind.Timeout,

        // A SocketException anywhere in the chain (bare or wrapped by the SDK) means the
        // host could not be reached (DNS / connection refused).
        _ when HasSocketCause(failure) => VerifyFailureKind.Unreachable,

        ServiceResultException sre => sre.StatusCode switch
        {
            StatusCodes.BadRequestTimeout
                or StatusCodes.BadTimeout => VerifyFailureKind.Timeout,

            StatusCodes.BadUserAccessDenied
                or StatusCodes.BadIdentityTokenRejected
                or StatusCodes.BadIdentityTokenInvalid => VerifyFailureKind.AuthFailed,

            StatusCodes.BadConnectionRejected
                or StatusCodes.BadNotConnected
                or StatusCodes.BadConnectionClosed
                or StatusCodes.BadNoCommunication
                or StatusCodes.BadServerNotConnected => VerifyFailureKind.Unreachable,

            _ => VerifyFailureKind.ServerError
        },

        _ => VerifyFailureKind.ServerError
    };

    return new VerifyEndpointResult(false, kind, failure.Message, null);
}
/// <summary>
/// Reports whether <paramref name="ex"/> or any exception reachable through its
/// <c>InnerException</c> chain is a <see cref="System.Net.Sockets.SocketException"/>.
/// Callers treat that as the signature of a DNS-resolution or connection-refused
/// failure, i.e. an unreachable endpoint host.
/// </summary>
/// <param name="ex">The outermost exception to inspect.</param>
/// <returns>True if a socket exception is found anywhere in the chain; otherwise false.</returns>
private static bool HasSocketCause(Exception ex)
{
    var current = ex;
    while (current != null)
    {
        if (current is System.Net.Sockets.SocketException)
        {
            return true;
        }

        current = current.InnerException;
    }

    return false;
}
/// <inheritdoc />
public async Task DisconnectAsync(CancellationToken cancellationToken = default)
{
@@ -787,9 +787,15 @@ akka {{
{
var healthCollector = _serviceProvider.GetRequiredService<ZB.MOM.WW.ScadaBridge.HealthMonitoring.ISiteHealthCollector>();
var siteEventLogger = _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.SiteEventLogging.ISiteEventLogger>();
// T17: the verify-endpoint probe builds an OPC UA ApplicationConfiguration directly,
// so the manager needs the same deployment-wide OpcUaGlobalOptions the
// DataConnectionFactory feeds to RealOpcUaClient when creating connections.
var opcUaGlobalOptions = _serviceProvider
.GetService<IOptions<ZB.MOM.WW.ScadaBridge.DataConnectionLayer.OpcUaGlobalOptions>>()?.Value
?? new ZB.MOM.WW.ScadaBridge.DataConnectionLayer.OpcUaGlobalOptions();
dclManager = _actorSystem!.ActorOf(
Props.Create(() => new ZB.MOM.WW.ScadaBridge.DataConnectionLayer.Actors.DataConnectionManagerActor(
dclFactory, dclOptions, healthCollector, siteEventLogger)),
dclFactory, dclOptions, healthCollector, siteEventLogger, opcUaGlobalOptions)),
"dcl-manager");
_logger.LogInformation("Data Connection Layer manager actor created");
}