Phase 3 PR 72 -- Multi-endpoint failover for OPC UA Client #71

Merged
dohertj2 merged 1 commits from phase-3-pr72-opcua-client-failover into v2 2026-04-19 01:54:37 -04:00
3 changed files with 212 additions and 44 deletions

View File

@@ -59,6 +59,8 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
private DriverHealth _health = new(DriverState.Unknown, null, null);
private bool _disposed;
/// <summary>URL of the endpoint the driver actually connected to. Exposed via <see cref="HostName"/>.</summary>
private string? _connectedEndpointUrl;
public string DriverInstanceId => driverInstanceId;
public string DriverType => "OpcUaClient";
@@ -69,51 +71,38 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
try
{
var appConfig = await BuildApplicationConfigurationAsync(cancellationToken).ConfigureAwait(false);
var candidates = ResolveEndpointCandidates(_options);
// Endpoint selection: let the stack pick the best matching endpoint for the
// requested security policy/mode so the driver doesn't have to hand-validate.
// UseSecurity=false when SecurityMode=None shortcuts around cert validation
// entirely and is the typical dev-bench configuration.
var selected = await SelectMatchingEndpointAsync(
appConfig, _options.EndpointUrl, _options.SecurityPolicy, _options.SecurityMode,
cancellationToken).ConfigureAwait(false);
var endpointConfig = EndpointConfiguration.Create(appConfig);
endpointConfig.OperationTimeout = (int)_options.Timeout.TotalMilliseconds;
var endpoint = new ConfiguredEndpoint(null, selected, endpointConfig);
var identity = BuildUserIdentity(_options);
var identity = _options.AuthType switch
// Failover sweep: try each endpoint in order, return the session from the first
// one that successfully connects. Per-endpoint failures are captured so the final
// aggregate exception names every URL that was tried and why — critical diag for
// operators debugging 'why did the failover pick #3?'.
var attemptErrors = new List<string>(candidates.Count);
ISession? session = null;
string? connectedUrl = null;
foreach (var url in candidates)
{
OpcUaAuthType.Anonymous => new UserIdentity(new AnonymousIdentityToken()),
// The UserIdentity(string, string) overload was removed in favour of
// (string, byte[]) to make the password encoding explicit. UTF-8 is the
// overwhelmingly common choice for Basic256Sha256-secured sessions.
OpcUaAuthType.Username => new UserIdentity(
_options.Username ?? string.Empty,
System.Text.Encoding.UTF8.GetBytes(_options.Password ?? string.Empty)),
OpcUaAuthType.Certificate => BuildCertificateIdentity(_options),
_ => new UserIdentity(new AnonymousIdentityToken()),
};
try
{
session = await OpenSessionOnEndpointAsync(
appConfig, url, _options.SecurityPolicy, _options.SecurityMode,
identity, cancellationToken).ConfigureAwait(false);
connectedUrl = url;
break;
}
catch (Exception ex)
{
attemptErrors.Add($"{url} -> {ex.GetType().Name}: {ex.Message}");
}
}
// All Session.Create* static methods are marked [Obsolete] in SDK 1.5.378; the
// non-obsolete path is DefaultSessionFactory.Instance.CreateAsync (which is the
// 8-arg signature matching our driver config — ApplicationConfiguration +
// ConfiguredEndpoint, no transport-waiting-connection or reverse-connect-manager
// required for the standard opc.tcp direct-connect case).
// DefaultSessionFactory's parameterless ctor is also obsolete in 1.5.378; the
// current constructor requires an ITelemetryContext. Passing null is tolerated —
// the factory falls back to its internal default sink, same as the telemetry:null
// on SelectEndpointAsync above.
var session = await new DefaultSessionFactory(telemetry: null!).CreateAsync(
appConfig,
endpoint,
false, // updateBeforeConnect
_options.SessionName,
(uint)_options.SessionTimeout.TotalMilliseconds,
identity,
null, // preferredLocales
cancellationToken).ConfigureAwait(false);
session.KeepAliveInterval = (int)_options.KeepAliveInterval.TotalMilliseconds;
if (session is null)
throw new AggregateException(
"OPC UA Client failed to connect to any of the configured endpoints. " +
"Tried:\n " + string.Join("\n ", attemptErrors),
attemptErrors.Select(e => new InvalidOperationException(e)));
// Wire the session's keep-alive channel into HostState. OPC UA keep-alives are
// authoritative for session liveness: the SDK pings on KeepAliveInterval and sets
@@ -128,6 +117,7 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
session.KeepAlive += _keepAliveHandler;
Session = session;
_connectedEndpointUrl = connectedUrl;
_health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
TransitionTo(HostState.Running);
}
@@ -224,6 +214,71 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
return config;
}
/// <summary>
/// Resolve the ordered failover candidate list. <c>EndpointUrls</c> wins when
/// non-empty; otherwise fall back to <c>EndpointUrl</c> as a single-URL shortcut so
/// existing single-endpoint configs keep working without migration.
/// </summary>
internal static IReadOnlyList<string> ResolveEndpointCandidates(OpcUaClientDriverOptions opts)
{
if (opts.EndpointUrls is { Count: > 0 }) return opts.EndpointUrls;
return [opts.EndpointUrl];
}
/// <summary>
/// Build the user-identity token from the driver options. Split out of
/// <see cref="InitializeAsync"/> so the failover sweep reuses one identity across
/// every endpoint attempt — generating it N times would re-unlock the user cert's
/// private key N times, wasteful + keeps the password in memory longer.
/// </summary>
internal static UserIdentity BuildUserIdentity(OpcUaClientDriverOptions options) =>
options.AuthType switch
{
OpcUaAuthType.Anonymous => new UserIdentity(new AnonymousIdentityToken()),
OpcUaAuthType.Username => new UserIdentity(
options.Username ?? string.Empty,
System.Text.Encoding.UTF8.GetBytes(options.Password ?? string.Empty)),
OpcUaAuthType.Certificate => BuildCertificateIdentity(options),
_ => new UserIdentity(new AnonymousIdentityToken()),
};
/// <summary>
/// Open a session against a single endpoint URL. Bounded by
/// <see cref="OpcUaClientDriverOptions.PerEndpointConnectTimeout"/> so the failover
/// sweep doesn't spend its full budget on one dead server. Moved out of
/// <see cref="InitializeAsync"/> so the failover loop body stays readable.
/// </summary>
private async Task<ISession> OpenSessionOnEndpointAsync(
ApplicationConfiguration appConfig,
string endpointUrl,
OpcUaSecurityPolicy policy,
OpcUaSecurityMode mode,
UserIdentity identity,
CancellationToken ct)
{
using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
cts.CancelAfter(_options.PerEndpointConnectTimeout);
var selected = await SelectMatchingEndpointAsync(
appConfig, endpointUrl, policy, mode, cts.Token).ConfigureAwait(false);
var endpointConfig = EndpointConfiguration.Create(appConfig);
endpointConfig.OperationTimeout = (int)_options.Timeout.TotalMilliseconds;
var endpoint = new ConfiguredEndpoint(null, selected, endpointConfig);
var session = await new DefaultSessionFactory(telemetry: null!).CreateAsync(
appConfig,
endpoint,
false, // updateBeforeConnect
_options.SessionName,
(uint)_options.SessionTimeout.TotalMilliseconds,
identity,
null, // preferredLocales
cts.Token).ConfigureAwait(false);
session.KeepAliveInterval = (int)_options.KeepAliveInterval.TotalMilliseconds;
return session;
}
/// <summary>
/// Select the remote endpoint matching both the requested <paramref name="policy"/>
/// and <paramref name="mode"/>. The SDK's <c>CoreClientUtils.SelectEndpointAsync</c>
@@ -347,6 +402,7 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
catch { /* best-effort */ }
try { Session?.Dispose(); } catch { }
Session = null;
_connectedEndpointUrl = null;
TransitionTo(HostState.Unknown);
_health = new DriverHealth(DriverState.Unknown, _health.LastSuccessfulRead, null);
@@ -718,8 +774,16 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
// ---- IHostConnectivityProbe ----
/// <summary>Endpoint-URL-keyed host identity for the Admin /hosts dashboard.</summary>
public string HostName => _options.EndpointUrl;
/// <summary>
/// Endpoint-URL-keyed host identity for the Admin /hosts dashboard. Reflects the
/// endpoint the driver actually connected to after the failover sweep — not the
/// first URL in the candidate list — so operators see which of the configured
/// endpoints is currently serving traffic. Falls back to the first configured URL
/// pre-init so the dashboard has something to render before the first connect.
/// </summary>
public string HostName => _connectedEndpointUrl
?? ResolveEndpointCandidates(_options).FirstOrDefault()
?? _options.EndpointUrl;
public IReadOnlyList<HostConnectivityStatus> GetHostStatuses()
{

View File

@@ -13,9 +13,32 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient;
/// </remarks>
public sealed class OpcUaClientDriverOptions
{
/// <summary>Remote OPC UA endpoint URL, e.g. <c>opc.tcp://plc.internal:4840</c>.</summary>
/// <summary>
/// Remote OPC UA endpoint URL, e.g. <c>opc.tcp://plc.internal:4840</c>. Convenience
/// shortcut for a single-endpoint deployment — equivalent to setting
/// <see cref="EndpointUrls"/> to a list with this one URL. When both are provided,
/// the list wins and <see cref="EndpointUrl"/> is ignored.
/// </summary>
public string EndpointUrl { get; init; } = "opc.tcp://localhost:4840";
/// <summary>
/// Ordered list of candidate endpoint URLs for failover. The driver tries each in
/// order at <see cref="OpcUaClientDriver.InitializeAsync"/> and on session drop;
/// the first URL that successfully connects wins. Typical use-case: an OPC UA server
/// pair running in hot-standby (primary 4840 + backup 4841) where either can serve
/// the same address space. Leave unset (or empty) to use <see cref="EndpointUrl"/>
/// as a single-URL shortcut.
/// </summary>
public IReadOnlyList<string> EndpointUrls { get; init; } = [];
/// <summary>
/// Per-endpoint connect-attempt timeout during the failover sweep. Short enough that
/// cycling through several dead servers doesn't blow the overall init budget, long
/// enough to tolerate a slow TLS handshake on a healthy server. Applied independently
/// of <see cref="Timeout"/> which governs steady-state operations.
/// </summary>
public TimeSpan PerEndpointConnectTimeout { get; init; } = TimeSpan.FromSeconds(3);
/// <summary>
/// Security policy to require when selecting an endpoint. Either a
/// <see cref="OpcUaSecurityPolicy"/> enum constant or a free-form string (for

View File

@@ -0,0 +1,81 @@
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
[Trait("Category", "Unit")]
public sealed class OpcUaClientFailoverTests
{
[Fact]
public void ResolveEndpointCandidates_prefers_EndpointUrls_when_provided()
{
var opts = new OpcUaClientDriverOptions
{
EndpointUrl = "opc.tcp://fallback:4840",
EndpointUrls = ["opc.tcp://primary:4840", "opc.tcp://backup:4841"],
};
var list = OpcUaClientDriver.ResolveEndpointCandidates(opts);
list.Count.ShouldBe(2);
list[0].ShouldBe("opc.tcp://primary:4840");
list[1].ShouldBe("opc.tcp://backup:4841");
}
[Fact]
public void ResolveEndpointCandidates_falls_back_to_single_EndpointUrl_when_list_empty()
{
var opts = new OpcUaClientDriverOptions { EndpointUrl = "opc.tcp://only:4840" };
var list = OpcUaClientDriver.ResolveEndpointCandidates(opts);
list.Count.ShouldBe(1);
list[0].ShouldBe("opc.tcp://only:4840");
}
[Fact]
public void ResolveEndpointCandidates_empty_list_treated_as_fallback_to_EndpointUrl()
{
// Explicit empty list should still fall back to the single-URL shortcut rather than
// producing a zero-candidate sweep that would immediately throw with no URLs tried.
var opts = new OpcUaClientDriverOptions
{
EndpointUrl = "opc.tcp://single:4840",
EndpointUrls = [],
};
OpcUaClientDriver.ResolveEndpointCandidates(opts).Count.ShouldBe(1);
}
[Fact]
public void HostName_uses_first_candidate_before_connect()
{
var opts = new OpcUaClientDriverOptions
{
EndpointUrls = ["opc.tcp://primary:4840", "opc.tcp://backup:4841"],
};
using var drv = new OpcUaClientDriver(opts, "opcua-host");
drv.HostName.ShouldBe("opc.tcp://primary:4840",
"pre-connect the dashboard should show the first candidate URL so operators can link back");
}
[Fact]
public async Task Initialize_against_all_unreachable_endpoints_throws_AggregateException_listing_each()
{
// Port 1 + port 2 + port 3 on loopback are all guaranteed closed (TCP RST immediate).
// Failover sweep should attempt all three and throw AggregateException naming each URL
// so operators see exactly which candidates were tried.
var opts = new OpcUaClientDriverOptions
{
EndpointUrls = ["opc.tcp://127.0.0.1:1", "opc.tcp://127.0.0.1:2", "opc.tcp://127.0.0.1:3"],
PerEndpointConnectTimeout = TimeSpan.FromMilliseconds(500),
Timeout = TimeSpan.FromMilliseconds(500),
AutoAcceptCertificates = true,
};
using var drv = new OpcUaClientDriver(opts, "opcua-failover");
var ex = await Should.ThrowAsync<AggregateException>(async () =>
await drv.InitializeAsync("{}", TestContext.Current.CancellationToken));
ex.Message.ShouldContain("127.0.0.1:1");
ex.Message.ShouldContain("127.0.0.1:2");
ex.Message.ShouldContain("127.0.0.1:3");
drv.GetHealth().State.ShouldBe(DriverState.Faulted);
}
}