From 24435712c43d35a6600c3c3c633a4bf6ffbe08ec Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sun, 19 Apr 2026 01:52:31 -0400 Subject: [PATCH] Phase 3 PR 72 -- Multi-endpoint failover for OPC UA Client driver. Adds OpcUaClientDriverOptions.EndpointUrls ordered list + PerEndpointConnectTimeout knob. On InitializeAsync the driver walks the candidate list in order via ResolveEndpointCandidates and returns the session from the first endpoint that successfully connects. Captures per-URL failure reasons in a List and, if every candidate fails, throws AggregateException whose message names every URL + its failure class (e.g. 'opc.tcp://primary:4840 -> TimeoutException: ...'). That's critical diag for field debugging -- without it 'failover picked the wrong one' surfaces as a mystery. Single-URL backwards compat: EndpointUrl field retained as a one-URL shortcut. When EndpointUrls is null or empty the driver falls through to a single-candidate list of [EndpointUrl], so every existing single-endpoint config keeps working without migration. When both are provided, EndpointUrls wins + EndpointUrl is ignored -- documented on the field xml-doc. Per-endpoint connect budget: PerEndpointConnectTimeout (default 3s) caps each attempt so a sweep over several dead servers can't blow the overall init budget. Applied via CancellationTokenSource.CreateLinkedTokenSource + CancelAfter inside OpenSessionOnEndpointAsync (the extracted single-endpoint connect helper) so the cap is independent of the outer Options.Timeout which governs steady-state ops. BuildUserIdentity extracted out of InitializeAsync so the failover loop builds the UserIdentity ONCE and reuses it across every endpoint attempt -- generating it N times would re-unlock the user cert's private key N times, wasteful + keeps the password in memory longer. 
HostName now reflects the endpoint that actually connected via _connectedEndpointUrl instead of always returning opts.EndpointUrl -- so the Admin /hosts dashboard shows which of the configured endpoints is currently serving traffic (primary vs backup). Falls back to the first candidate pre-connect so the dashboard has a sensible identity before the first connect, and resets to null on ShutdownAsync. Use case: an OPC UA hot-standby server pair (primary 4840 + backup 4841) where either can serve the same address space. Operator configures EndpointUrls=[primary, backup]; driver tries primary first, falls over to backup on primary failure with a clean AggregateException describing both attempts if both are down. Unit tests (OpcUaClientFailoverTests, 5 facts): ResolveEndpointCandidates_prefers_EndpointUrls_when_provided (list trumps single), ResolveEndpointCandidates_falls_back_to_single_EndpointUrl_when_list_empty (legacy config compat), ResolveEndpointCandidates_empty_list_treated_as_fallback_to_EndpointUrl (explicit empty list also falls back -- otherwise we'd produce a zero-candidate sweep that throws with nothing tried), HostName_uses_first_candidate_before_connect (dashboard rendering pre-connect), Initialize_against_all_unreachable_endpoints_throws_AggregateException_listing_each (three loopback dead ports, asserts each URL appears in the aggregate message + driver flips to Faulted). 31/31 OpcUaClient.Tests pass. dotnet build clean. OPC UA Client driver security/auth/availability feature set now complete per driver-specs.md §8: policy-filtered endpoint selection (PR 70), Anonymous+Username+Certificate auth (PR 71), multi-endpoint failover (this PR). 
--- .../OpcUaClientDriver.cs | 150 +++++++++++++----- .../OpcUaClientDriverOptions.cs | 25 ++- .../OpcUaClientFailoverTests.cs | 81 ++++++++++ 3 files changed, 212 insertions(+), 44 deletions(-) create mode 100644 tests/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests/OpcUaClientFailoverTests.cs diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriver.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriver.cs index 7a96725..e4815d8 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriver.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriver.cs @@ -59,6 +59,8 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d private DriverHealth _health = new(DriverState.Unknown, null, null); private bool _disposed; + /// URL of the endpoint the driver actually connected to. Exposed via . + private string? _connectedEndpointUrl; public string DriverInstanceId => driverInstanceId; public string DriverType => "OpcUaClient"; @@ -69,51 +71,38 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d try { var appConfig = await BuildApplicationConfigurationAsync(cancellationToken).ConfigureAwait(false); + var candidates = ResolveEndpointCandidates(_options); - // Endpoint selection: let the stack pick the best matching endpoint for the - // requested security policy/mode so the driver doesn't have to hand-validate. - // UseSecurity=false when SecurityMode=None shortcuts around cert validation - // entirely and is the typical dev-bench configuration. 
- var selected = await SelectMatchingEndpointAsync( - appConfig, _options.EndpointUrl, _options.SecurityPolicy, _options.SecurityMode, - cancellationToken).ConfigureAwait(false); - var endpointConfig = EndpointConfiguration.Create(appConfig); - endpointConfig.OperationTimeout = (int)_options.Timeout.TotalMilliseconds; - var endpoint = new ConfiguredEndpoint(null, selected, endpointConfig); + var identity = BuildUserIdentity(_options); - var identity = _options.AuthType switch + // Failover sweep: try each endpoint in order, return the session from the first + // one that successfully connects. Per-endpoint failures are captured so the final + // aggregate exception names every URL that was tried and why — critical diag for + // operators debugging 'why did the failover pick #3?'. + var attemptErrors = new List(candidates.Count); + ISession? session = null; + string? connectedUrl = null; + foreach (var url in candidates) { - OpcUaAuthType.Anonymous => new UserIdentity(new AnonymousIdentityToken()), - // The UserIdentity(string, string) overload was removed in favour of - // (string, byte[]) to make the password encoding explicit. UTF-8 is the - // overwhelmingly common choice for Basic256Sha256-secured sessions. - OpcUaAuthType.Username => new UserIdentity( - _options.Username ?? string.Empty, - System.Text.Encoding.UTF8.GetBytes(_options.Password ?? 
string.Empty)), - OpcUaAuthType.Certificate => BuildCertificateIdentity(_options), - _ => new UserIdentity(new AnonymousIdentityToken()), - }; + try + { + session = await OpenSessionOnEndpointAsync( + appConfig, url, _options.SecurityPolicy, _options.SecurityMode, + identity, cancellationToken).ConfigureAwait(false); + connectedUrl = url; + break; + } + catch (Exception ex) + { + attemptErrors.Add($"{url} -> {ex.GetType().Name}: {ex.Message}"); + } + } - // All Session.Create* static methods are marked [Obsolete] in SDK 1.5.378; the - // non-obsolete path is DefaultSessionFactory.Instance.CreateAsync (which is the - // 8-arg signature matching our driver config — ApplicationConfiguration + - // ConfiguredEndpoint, no transport-waiting-connection or reverse-connect-manager - // required for the standard opc.tcp direct-connect case). - // DefaultSessionFactory's parameterless ctor is also obsolete in 1.5.378; the - // current constructor requires an ITelemetryContext. Passing null is tolerated — - // the factory falls back to its internal default sink, same as the telemetry:null - // on SelectEndpointAsync above. - var session = await new DefaultSessionFactory(telemetry: null!).CreateAsync( - appConfig, - endpoint, - false, // updateBeforeConnect - _options.SessionName, - (uint)_options.SessionTimeout.TotalMilliseconds, - identity, - null, // preferredLocales - cancellationToken).ConfigureAwait(false); - - session.KeepAliveInterval = (int)_options.KeepAliveInterval.TotalMilliseconds; + if (session is null) + throw new AggregateException( + "OPC UA Client failed to connect to any of the configured endpoints. " + + "Tried:\n " + string.Join("\n ", attemptErrors), + attemptErrors.Select(e => new InvalidOperationException(e))); // Wire the session's keep-alive channel into HostState. 
OPC UA keep-alives are // authoritative for session liveness: the SDK pings on KeepAliveInterval and sets @@ -128,6 +117,7 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d session.KeepAlive += _keepAliveHandler; Session = session; + _connectedEndpointUrl = connectedUrl; _health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null); TransitionTo(HostState.Running); } @@ -224,6 +214,71 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d return config; } + /// + /// Resolve the ordered failover candidate list. EndpointUrls wins when + /// non-empty; otherwise fall back to EndpointUrl as a single-URL shortcut so + /// existing single-endpoint configs keep working without migration. + /// + internal static IReadOnlyList ResolveEndpointCandidates(OpcUaClientDriverOptions opts) + { + if (opts.EndpointUrls is { Count: > 0 }) return opts.EndpointUrls; + return [opts.EndpointUrl]; + } + + /// + /// Build the user-identity token from the driver options. Split out of + /// so the failover sweep reuses one identity across + /// every endpoint attempt — generating it N times would re-unlock the user cert's + /// private key N times, wasteful + keeps the password in memory longer. + /// + internal static UserIdentity BuildUserIdentity(OpcUaClientDriverOptions options) => + options.AuthType switch + { + OpcUaAuthType.Anonymous => new UserIdentity(new AnonymousIdentityToken()), + OpcUaAuthType.Username => new UserIdentity( + options.Username ?? string.Empty, + System.Text.Encoding.UTF8.GetBytes(options.Password ?? string.Empty)), + OpcUaAuthType.Certificate => BuildCertificateIdentity(options), + _ => new UserIdentity(new AnonymousIdentityToken()), + }; + + /// + /// Open a session against a single endpoint URL. Bounded by + /// so the failover + /// sweep doesn't spend its full budget on one dead server. Moved out of + /// so the failover loop body stays readable. 
+ /// + private async Task OpenSessionOnEndpointAsync( + ApplicationConfiguration appConfig, + string endpointUrl, + OpcUaSecurityPolicy policy, + OpcUaSecurityMode mode, + UserIdentity identity, + CancellationToken ct) + { + using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct); + cts.CancelAfter(_options.PerEndpointConnectTimeout); + + var selected = await SelectMatchingEndpointAsync( + appConfig, endpointUrl, policy, mode, cts.Token).ConfigureAwait(false); + var endpointConfig = EndpointConfiguration.Create(appConfig); + endpointConfig.OperationTimeout = (int)_options.Timeout.TotalMilliseconds; + var endpoint = new ConfiguredEndpoint(null, selected, endpointConfig); + + var session = await new DefaultSessionFactory(telemetry: null!).CreateAsync( + appConfig, + endpoint, + false, // updateBeforeConnect + _options.SessionName, + (uint)_options.SessionTimeout.TotalMilliseconds, + identity, + null, // preferredLocales + cts.Token).ConfigureAwait(false); + + session.KeepAliveInterval = (int)_options.KeepAliveInterval.TotalMilliseconds; + return session; + } + /// /// Select the remote endpoint matching both the requested /// and . The SDK's CoreClientUtils.SelectEndpointAsync @@ -347,6 +402,7 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d catch { /* best-effort */ } try { Session?.Dispose(); } catch { } Session = null; + _connectedEndpointUrl = null; TransitionTo(HostState.Unknown); _health = new DriverHealth(DriverState.Unknown, _health.LastSuccessfulRead, null); @@ -718,8 +774,16 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d // ---- IHostConnectivityProbe ---- - /// Endpoint-URL-keyed host identity for the Admin /hosts dashboard. - public string HostName => _options.EndpointUrl; + /// + /// Endpoint-URL-keyed host identity for the Admin /hosts dashboard. 
Reflects the + /// endpoint the driver actually connected to after the failover sweep — not the + /// first URL in the candidate list — so operators see which of the configured + /// endpoints is currently serving traffic. Falls back to the first configured URL + /// pre-init so the dashboard has something to render before the first connect. + /// + public string HostName => _connectedEndpointUrl + ?? ResolveEndpointCandidates(_options).FirstOrDefault() + ?? _options.EndpointUrl; public IReadOnlyList GetHostStatuses() { diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriverOptions.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriverOptions.cs index d61287b..877bd4e 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriverOptions.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriverOptions.cs @@ -13,9 +13,32 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient; /// public sealed class OpcUaClientDriverOptions { - /// Remote OPC UA endpoint URL, e.g. opc.tcp://plc.internal:4840. + /// + /// Remote OPC UA endpoint URL, e.g. opc.tcp://plc.internal:4840. Convenience + /// shortcut for a single-endpoint deployment — equivalent to setting + /// to a list with this one URL. When both are provided, + /// the list wins and is ignored. + /// public string EndpointUrl { get; init; } = "opc.tcp://localhost:4840"; + /// + /// Ordered list of candidate endpoint URLs for failover. The driver tries each in + /// order at and on session drop; + /// the first URL that successfully connects wins. Typical use-case: an OPC UA server + /// pair running in hot-standby (primary 4840 + backup 4841) where either can serve + /// the same address space. Leave unset (or empty) to use + /// as a single-URL shortcut. + /// + public IReadOnlyList EndpointUrls { get; init; } = []; + + /// + /// Per-endpoint connect-attempt timeout during the failover sweep. 
Short enough that + /// cycling through several dead servers doesn't blow the overall init budget, long + /// enough to tolerate a slow TLS handshake on a healthy server. Applied independently + /// of which governs steady-state operations. + /// + public TimeSpan PerEndpointConnectTimeout { get; init; } = TimeSpan.FromSeconds(3); + /// /// Security policy to require when selecting an endpoint. Either a /// enum constant or a free-form string (for diff --git a/tests/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests/OpcUaClientFailoverTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests/OpcUaClientFailoverTests.cs new file mode 100644 index 0000000..73edb82 --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests/OpcUaClientFailoverTests.cs @@ -0,0 +1,81 @@ +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Core.Abstractions; + +namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests; + +[Trait("Category", "Unit")] +public sealed class OpcUaClientFailoverTests +{ + [Fact] + public void ResolveEndpointCandidates_prefers_EndpointUrls_when_provided() + { + var opts = new OpcUaClientDriverOptions + { + EndpointUrl = "opc.tcp://fallback:4840", + EndpointUrls = ["opc.tcp://primary:4840", "opc.tcp://backup:4841"], + }; + var list = OpcUaClientDriver.ResolveEndpointCandidates(opts); + list.Count.ShouldBe(2); + list[0].ShouldBe("opc.tcp://primary:4840"); + list[1].ShouldBe("opc.tcp://backup:4841"); + } + + [Fact] + public void ResolveEndpointCandidates_falls_back_to_single_EndpointUrl_when_list_empty() + { + var opts = new OpcUaClientDriverOptions { EndpointUrl = "opc.tcp://only:4840" }; + var list = OpcUaClientDriver.ResolveEndpointCandidates(opts); + list.Count.ShouldBe(1); + list[0].ShouldBe("opc.tcp://only:4840"); + } + + [Fact] + public void ResolveEndpointCandidates_empty_list_treated_as_fallback_to_EndpointUrl() + { + // Explicit empty list should still fall back to the single-URL shortcut rather than + // producing a zero-candidate sweep that would 
immediately throw with no URLs tried. + var opts = new OpcUaClientDriverOptions + { + EndpointUrl = "opc.tcp://single:4840", + EndpointUrls = [], + }; + OpcUaClientDriver.ResolveEndpointCandidates(opts).Count.ShouldBe(1); + } + + [Fact] + public void HostName_uses_first_candidate_before_connect() + { + var opts = new OpcUaClientDriverOptions + { + EndpointUrls = ["opc.tcp://primary:4840", "opc.tcp://backup:4841"], + }; + using var drv = new OpcUaClientDriver(opts, "opcua-host"); + drv.HostName.ShouldBe("opc.tcp://primary:4840", + "pre-connect the dashboard should show the first candidate URL so operators can link back"); + } + + [Fact] + public async Task Initialize_against_all_unreachable_endpoints_throws_AggregateException_listing_each() + { + // Port 1 + port 2 + port 3 on loopback are all guaranteed closed (TCP RST immediate). + // Failover sweep should attempt all three and throw AggregateException naming each URL + // so operators see exactly which candidates were tried. + var opts = new OpcUaClientDriverOptions + { + EndpointUrls = ["opc.tcp://127.0.0.1:1", "opc.tcp://127.0.0.1:2", "opc.tcp://127.0.0.1:3"], + PerEndpointConnectTimeout = TimeSpan.FromMilliseconds(500), + Timeout = TimeSpan.FromMilliseconds(500), + AutoAcceptCertificates = true, + }; + using var drv = new OpcUaClientDriver(opts, "opcua-failover"); + + var ex = await Should.ThrowAsync(async () => + await drv.InitializeAsync("{}", TestContext.Current.CancellationToken)); + + ex.Message.ShouldContain("127.0.0.1:1"); + ex.Message.ShouldContain("127.0.0.1:2"); + ex.Message.ShouldContain("127.0.0.1:3"); + drv.GetHealth().State.ShouldBe(DriverState.Faulted); + } +} -- 2.49.1