diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriver.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriver.cs
index 7a96725..e4815d8 100644
--- a/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriver.cs
+++ b/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriver.cs
@@ -59,6 +59,8 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
private DriverHealth _health = new(DriverState.Unknown, null, null);
private bool _disposed;
+ /// URL of the endpoint the driver actually connected to. Exposed via .
+ private string? _connectedEndpointUrl;
public string DriverInstanceId => driverInstanceId;
public string DriverType => "OpcUaClient";
@@ -69,51 +71,38 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
try
{
var appConfig = await BuildApplicationConfigurationAsync(cancellationToken).ConfigureAwait(false);
+ var candidates = ResolveEndpointCandidates(_options);
- // Endpoint selection: let the stack pick the best matching endpoint for the
- // requested security policy/mode so the driver doesn't have to hand-validate.
- // UseSecurity=false when SecurityMode=None shortcuts around cert validation
- // entirely and is the typical dev-bench configuration.
- var selected = await SelectMatchingEndpointAsync(
- appConfig, _options.EndpointUrl, _options.SecurityPolicy, _options.SecurityMode,
- cancellationToken).ConfigureAwait(false);
- var endpointConfig = EndpointConfiguration.Create(appConfig);
- endpointConfig.OperationTimeout = (int)_options.Timeout.TotalMilliseconds;
- var endpoint = new ConfiguredEndpoint(null, selected, endpointConfig);
+ var identity = BuildUserIdentity(_options);
- var identity = _options.AuthType switch
+ // Failover sweep: try each endpoint in order, return the session from the first
+ // one that successfully connects. Per-endpoint failures are captured so the final
+ // aggregate exception names every URL that was tried and why — critical diag for
+ // operators debugging 'why did the failover pick #3?'.
+ var attemptErrors = new List(candidates.Count);
+ ISession? session = null;
+ string? connectedUrl = null;
+ foreach (var url in candidates)
{
- OpcUaAuthType.Anonymous => new UserIdentity(new AnonymousIdentityToken()),
- // The UserIdentity(string, string) overload was removed in favour of
- // (string, byte[]) to make the password encoding explicit. UTF-8 is the
- // overwhelmingly common choice for Basic256Sha256-secured sessions.
- OpcUaAuthType.Username => new UserIdentity(
- _options.Username ?? string.Empty,
- System.Text.Encoding.UTF8.GetBytes(_options.Password ?? string.Empty)),
- OpcUaAuthType.Certificate => BuildCertificateIdentity(_options),
- _ => new UserIdentity(new AnonymousIdentityToken()),
- };
+ try
+ {
+ session = await OpenSessionOnEndpointAsync(
+ appConfig, url, _options.SecurityPolicy, _options.SecurityMode,
+ identity, cancellationToken).ConfigureAwait(false);
+ connectedUrl = url;
+ break;
+ }
+ catch (Exception ex)
+ {
+ attemptErrors.Add($"{url} -> {ex.GetType().Name}: {ex.Message}");
+ }
+ }
- // All Session.Create* static methods are marked [Obsolete] in SDK 1.5.378; the
- // non-obsolete path is DefaultSessionFactory.Instance.CreateAsync (which is the
- // 8-arg signature matching our driver config — ApplicationConfiguration +
- // ConfiguredEndpoint, no transport-waiting-connection or reverse-connect-manager
- // required for the standard opc.tcp direct-connect case).
- // DefaultSessionFactory's parameterless ctor is also obsolete in 1.5.378; the
- // current constructor requires an ITelemetryContext. Passing null is tolerated —
- // the factory falls back to its internal default sink, same as the telemetry:null
- // on SelectEndpointAsync above.
- var session = await new DefaultSessionFactory(telemetry: null!).CreateAsync(
- appConfig,
- endpoint,
- false, // updateBeforeConnect
- _options.SessionName,
- (uint)_options.SessionTimeout.TotalMilliseconds,
- identity,
- null, // preferredLocales
- cancellationToken).ConfigureAwait(false);
-
- session.KeepAliveInterval = (int)_options.KeepAliveInterval.TotalMilliseconds;
+ if (session is null)
+ throw new AggregateException(
+ "OPC UA Client failed to connect to any of the configured endpoints. " +
+ "Tried:\n " + string.Join("\n ", attemptErrors),
+ attemptErrors.Select(e => new InvalidOperationException(e)));
// Wire the session's keep-alive channel into HostState. OPC UA keep-alives are
// authoritative for session liveness: the SDK pings on KeepAliveInterval and sets
@@ -128,6 +117,7 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
session.KeepAlive += _keepAliveHandler;
Session = session;
+ _connectedEndpointUrl = connectedUrl;
_health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
TransitionTo(HostState.Running);
}
@@ -224,6 +214,71 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
return config;
}
+ ///
+ /// Resolve the ordered failover candidate list. EndpointUrls wins when
+ /// non-empty; otherwise fall back to EndpointUrl as a single-URL shortcut so
+ /// existing single-endpoint configs keep working without migration.
+ ///
+ internal static IReadOnlyList ResolveEndpointCandidates(OpcUaClientDriverOptions opts)
+ {
+ if (opts.EndpointUrls is { Count: > 0 }) return opts.EndpointUrls;
+ return [opts.EndpointUrl];
+ }
+
+ ///
+ /// Build the user-identity token from the driver options. Split out of
+ /// so the failover sweep reuses one identity across
+ /// every endpoint attempt — generating it N times would re-unlock the user cert's
+ /// private key N times, wasteful + keeps the password in memory longer.
+ ///
+ internal static UserIdentity BuildUserIdentity(OpcUaClientDriverOptions options) =>
+ options.AuthType switch
+ {
+ OpcUaAuthType.Anonymous => new UserIdentity(new AnonymousIdentityToken()),
+ OpcUaAuthType.Username => new UserIdentity(
+ options.Username ?? string.Empty,
+ System.Text.Encoding.UTF8.GetBytes(options.Password ?? string.Empty)),
+ OpcUaAuthType.Certificate => BuildCertificateIdentity(options),
+ _ => new UserIdentity(new AnonymousIdentityToken()),
+ };
+
+ ///
+ /// Open a session against a single endpoint URL. Bounded by
+ /// so the failover
+ /// sweep doesn't spend its full budget on one dead server. Moved out of
+ /// so the failover loop body stays readable.
+ ///
+ private async Task OpenSessionOnEndpointAsync(
+ ApplicationConfiguration appConfig,
+ string endpointUrl,
+ OpcUaSecurityPolicy policy,
+ OpcUaSecurityMode mode,
+ UserIdentity identity,
+ CancellationToken ct)
+ {
+ using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
+ cts.CancelAfter(_options.PerEndpointConnectTimeout);
+
+ var selected = await SelectMatchingEndpointAsync(
+ appConfig, endpointUrl, policy, mode, cts.Token).ConfigureAwait(false);
+ var endpointConfig = EndpointConfiguration.Create(appConfig);
+ endpointConfig.OperationTimeout = (int)_options.Timeout.TotalMilliseconds;
+ var endpoint = new ConfiguredEndpoint(null, selected, endpointConfig);
+
+ var session = await new DefaultSessionFactory(telemetry: null!).CreateAsync(
+ appConfig,
+ endpoint,
+ false, // updateBeforeConnect
+ _options.SessionName,
+ (uint)_options.SessionTimeout.TotalMilliseconds,
+ identity,
+ null, // preferredLocales
+ cts.Token).ConfigureAwait(false);
+
+ session.KeepAliveInterval = (int)_options.KeepAliveInterval.TotalMilliseconds;
+ return session;
+ }
+
///
/// Select the remote endpoint matching both the requested
/// and . The SDK's CoreClientUtils.SelectEndpointAsync
@@ -347,6 +402,7 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
catch { /* best-effort */ }
try { Session?.Dispose(); } catch { }
Session = null;
+ _connectedEndpointUrl = null;
TransitionTo(HostState.Unknown);
_health = new DriverHealth(DriverState.Unknown, _health.LastSuccessfulRead, null);
@@ -718,8 +774,16 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
// ---- IHostConnectivityProbe ----
- /// Endpoint-URL-keyed host identity for the Admin /hosts dashboard.
- public string HostName => _options.EndpointUrl;
+ ///
+ /// Endpoint-URL-keyed host identity for the Admin /hosts dashboard. Reflects the
+ /// endpoint the driver actually connected to after the failover sweep — not the
+ /// first URL in the candidate list — so operators see which of the configured
+ /// endpoints is currently serving traffic. Falls back to the first configured URL
+ /// pre-init so the dashboard has something to render before the first connect.
+ ///
+ public string HostName => _connectedEndpointUrl
+ ?? ResolveEndpointCandidates(_options).FirstOrDefault()
+ ?? _options.EndpointUrl;
public IReadOnlyList GetHostStatuses()
{
diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriverOptions.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriverOptions.cs
index d61287b..877bd4e 100644
--- a/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriverOptions.cs
+++ b/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriverOptions.cs
@@ -13,9 +13,32 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient;
///
public sealed class OpcUaClientDriverOptions
{
- /// Remote OPC UA endpoint URL, e.g. opc.tcp://plc.internal:4840.
+ ///
+ /// Remote OPC UA endpoint URL, e.g. opc.tcp://plc.internal:4840. Convenience
+ /// shortcut for a single-endpoint deployment — equivalent to setting
+ /// to a list with this one URL. When both are provided,
+ /// the list wins and is ignored.
+ ///
public string EndpointUrl { get; init; } = "opc.tcp://localhost:4840";
+ ///
+ /// Ordered list of candidate endpoint URLs for failover. The driver tries each in
+ /// order at and on session drop;
+ /// the first URL that successfully connects wins. Typical use-case: an OPC UA server
+ /// pair running in hot-standby (primary 4840 + backup 4841) where either can serve
+ /// the same address space. Leave unset (or empty) to use
+ /// as a single-URL shortcut.
+ ///
+ public IReadOnlyList EndpointUrls { get; init; } = [];
+
+ ///
+ /// Per-endpoint connect-attempt timeout during the failover sweep. Short enough that
+ /// cycling through several dead servers doesn't blow the overall init budget, long
+ /// enough to tolerate a slow TLS handshake on a healthy server. Applied independently
+ /// of which governs steady-state operations.
+ ///
+ public TimeSpan PerEndpointConnectTimeout { get; init; } = TimeSpan.FromSeconds(3);
+
///
/// Security policy to require when selecting an endpoint. Either a
/// enum constant or a free-form string (for
diff --git a/tests/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests/OpcUaClientFailoverTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests/OpcUaClientFailoverTests.cs
new file mode 100644
index 0000000..73edb82
--- /dev/null
+++ b/tests/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests/OpcUaClientFailoverTests.cs
@@ -0,0 +1,81 @@
+using Shouldly;
+using Xunit;
+using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
+
+namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
+
+[Trait("Category", "Unit")]
+public sealed class OpcUaClientFailoverTests
+{
+ [Fact]
+ public void ResolveEndpointCandidates_prefers_EndpointUrls_when_provided()
+ {
+ var opts = new OpcUaClientDriverOptions
+ {
+ EndpointUrl = "opc.tcp://fallback:4840",
+ EndpointUrls = ["opc.tcp://primary:4840", "opc.tcp://backup:4841"],
+ };
+ var list = OpcUaClientDriver.ResolveEndpointCandidates(opts);
+ list.Count.ShouldBe(2);
+ list[0].ShouldBe("opc.tcp://primary:4840");
+ list[1].ShouldBe("opc.tcp://backup:4841");
+ }
+
+ [Fact]
+ public void ResolveEndpointCandidates_falls_back_to_single_EndpointUrl_when_list_empty()
+ {
+ var opts = new OpcUaClientDriverOptions { EndpointUrl = "opc.tcp://only:4840" };
+ var list = OpcUaClientDriver.ResolveEndpointCandidates(opts);
+ list.Count.ShouldBe(1);
+ list[0].ShouldBe("opc.tcp://only:4840");
+ }
+
+ [Fact]
+ public void ResolveEndpointCandidates_empty_list_treated_as_fallback_to_EndpointUrl()
+ {
+ // Explicit empty list should still fall back to the single-URL shortcut rather than
+ // producing a zero-candidate sweep that would immediately throw with no URLs tried.
+ var opts = new OpcUaClientDriverOptions
+ {
+ EndpointUrl = "opc.tcp://single:4840",
+ EndpointUrls = [],
+ };
+ OpcUaClientDriver.ResolveEndpointCandidates(opts).Count.ShouldBe(1);
+ }
+
+ [Fact]
+ public void HostName_uses_first_candidate_before_connect()
+ {
+ var opts = new OpcUaClientDriverOptions
+ {
+ EndpointUrls = ["opc.tcp://primary:4840", "opc.tcp://backup:4841"],
+ };
+ using var drv = new OpcUaClientDriver(opts, "opcua-host");
+ drv.HostName.ShouldBe("opc.tcp://primary:4840",
+ "pre-connect the dashboard should show the first candidate URL so operators can link back");
+ }
+
+ [Fact]
+ public async Task Initialize_against_all_unreachable_endpoints_throws_AggregateException_listing_each()
+ {
+ // Port 1 + port 2 + port 3 on loopback are all guaranteed closed (TCP RST immediate).
+ // Failover sweep should attempt all three and throw AggregateException naming each URL
+ // so operators see exactly which candidates were tried.
+ var opts = new OpcUaClientDriverOptions
+ {
+ EndpointUrls = ["opc.tcp://127.0.0.1:1", "opc.tcp://127.0.0.1:2", "opc.tcp://127.0.0.1:3"],
+ PerEndpointConnectTimeout = TimeSpan.FromMilliseconds(500),
+ Timeout = TimeSpan.FromMilliseconds(500),
+ AutoAcceptCertificates = true,
+ };
+ using var drv = new OpcUaClientDriver(opts, "opcua-failover");
+
+ var ex = await Should.ThrowAsync(async () =>
+ await drv.InitializeAsync("{}", TestContext.Current.CancellationToken));
+
+ ex.Message.ShouldContain("127.0.0.1:1");
+ ex.Message.ShouldContain("127.0.0.1:2");
+ ex.Message.ShouldContain("127.0.0.1:3");
+ drv.GetHealth().State.ShouldBe(DriverState.Faulted);
+ }
+}