Phase 3 PR 72 -- Multi-endpoint failover for OPC UA Client #71
@@ -59,6 +59,8 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
|||||||
|
|
||||||
private DriverHealth _health = new(DriverState.Unknown, null, null);
|
private DriverHealth _health = new(DriverState.Unknown, null, null);
|
||||||
private bool _disposed;
|
private bool _disposed;
|
||||||
|
/// <summary>URL of the endpoint the driver actually connected to. Exposed via <see cref="HostName"/>.</summary>
|
||||||
|
private string? _connectedEndpointUrl;
|
||||||
|
|
||||||
public string DriverInstanceId => driverInstanceId;
|
public string DriverInstanceId => driverInstanceId;
|
||||||
public string DriverType => "OpcUaClient";
|
public string DriverType => "OpcUaClient";
|
||||||
@@ -69,51 +71,38 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
|||||||
try
|
try
|
||||||
{
|
{
|
||||||
var appConfig = await BuildApplicationConfigurationAsync(cancellationToken).ConfigureAwait(false);
|
var appConfig = await BuildApplicationConfigurationAsync(cancellationToken).ConfigureAwait(false);
|
||||||
|
var candidates = ResolveEndpointCandidates(_options);
|
||||||
|
|
||||||
// Endpoint selection: let the stack pick the best matching endpoint for the
|
var identity = BuildUserIdentity(_options);
|
||||||
// requested security policy/mode so the driver doesn't have to hand-validate.
|
|
||||||
// UseSecurity=false when SecurityMode=None shortcuts around cert validation
|
|
||||||
// entirely and is the typical dev-bench configuration.
|
|
||||||
var selected = await SelectMatchingEndpointAsync(
|
|
||||||
appConfig, _options.EndpointUrl, _options.SecurityPolicy, _options.SecurityMode,
|
|
||||||
cancellationToken).ConfigureAwait(false);
|
|
||||||
var endpointConfig = EndpointConfiguration.Create(appConfig);
|
|
||||||
endpointConfig.OperationTimeout = (int)_options.Timeout.TotalMilliseconds;
|
|
||||||
var endpoint = new ConfiguredEndpoint(null, selected, endpointConfig);
|
|
||||||
|
|
||||||
var identity = _options.AuthType switch
|
// Failover sweep: try each endpoint in order, return the session from the first
|
||||||
|
// one that successfully connects. Per-endpoint failures are captured so the final
|
||||||
|
// aggregate exception names every URL that was tried and why — critical diag for
|
||||||
|
// operators debugging 'why did the failover pick #3?'.
|
||||||
|
var attemptErrors = new List<string>(candidates.Count);
|
||||||
|
ISession? session = null;
|
||||||
|
string? connectedUrl = null;
|
||||||
|
foreach (var url in candidates)
|
||||||
{
|
{
|
||||||
OpcUaAuthType.Anonymous => new UserIdentity(new AnonymousIdentityToken()),
|
try
|
||||||
// The UserIdentity(string, string) overload was removed in favour of
|
{
|
||||||
// (string, byte[]) to make the password encoding explicit. UTF-8 is the
|
session = await OpenSessionOnEndpointAsync(
|
||||||
// overwhelmingly common choice for Basic256Sha256-secured sessions.
|
appConfig, url, _options.SecurityPolicy, _options.SecurityMode,
|
||||||
OpcUaAuthType.Username => new UserIdentity(
|
identity, cancellationToken).ConfigureAwait(false);
|
||||||
_options.Username ?? string.Empty,
|
connectedUrl = url;
|
||||||
System.Text.Encoding.UTF8.GetBytes(_options.Password ?? string.Empty)),
|
break;
|
||||||
OpcUaAuthType.Certificate => BuildCertificateIdentity(_options),
|
}
|
||||||
_ => new UserIdentity(new AnonymousIdentityToken()),
|
catch (Exception ex)
|
||||||
};
|
{
|
||||||
|
attemptErrors.Add($"{url} -> {ex.GetType().Name}: {ex.Message}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// All Session.Create* static methods are marked [Obsolete] in SDK 1.5.378; the
|
if (session is null)
|
||||||
// non-obsolete path is DefaultSessionFactory.Instance.CreateAsync (which is the
|
throw new AggregateException(
|
||||||
// 8-arg signature matching our driver config — ApplicationConfiguration +
|
"OPC UA Client failed to connect to any of the configured endpoints. " +
|
||||||
// ConfiguredEndpoint, no transport-waiting-connection or reverse-connect-manager
|
"Tried:\n " + string.Join("\n ", attemptErrors),
|
||||||
// required for the standard opc.tcp direct-connect case).
|
attemptErrors.Select(e => new InvalidOperationException(e)));
|
||||||
// DefaultSessionFactory's parameterless ctor is also obsolete in 1.5.378; the
|
|
||||||
// current constructor requires an ITelemetryContext. Passing null is tolerated —
|
|
||||||
// the factory falls back to its internal default sink, same as the telemetry:null
|
|
||||||
// on SelectEndpointAsync above.
|
|
||||||
var session = await new DefaultSessionFactory(telemetry: null!).CreateAsync(
|
|
||||||
appConfig,
|
|
||||||
endpoint,
|
|
||||||
false, // updateBeforeConnect
|
|
||||||
_options.SessionName,
|
|
||||||
(uint)_options.SessionTimeout.TotalMilliseconds,
|
|
||||||
identity,
|
|
||||||
null, // preferredLocales
|
|
||||||
cancellationToken).ConfigureAwait(false);
|
|
||||||
|
|
||||||
session.KeepAliveInterval = (int)_options.KeepAliveInterval.TotalMilliseconds;
|
|
||||||
|
|
||||||
// Wire the session's keep-alive channel into HostState. OPC UA keep-alives are
|
// Wire the session's keep-alive channel into HostState. OPC UA keep-alives are
|
||||||
// authoritative for session liveness: the SDK pings on KeepAliveInterval and sets
|
// authoritative for session liveness: the SDK pings on KeepAliveInterval and sets
|
||||||
@@ -128,6 +117,7 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
|||||||
session.KeepAlive += _keepAliveHandler;
|
session.KeepAlive += _keepAliveHandler;
|
||||||
|
|
||||||
Session = session;
|
Session = session;
|
||||||
|
_connectedEndpointUrl = connectedUrl;
|
||||||
_health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
|
_health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
|
||||||
TransitionTo(HostState.Running);
|
TransitionTo(HostState.Running);
|
||||||
}
|
}
|
||||||
@@ -224,6 +214,71 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
|||||||
return config;
|
return config;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Resolve the ordered failover candidate list. <c>EndpointUrls</c> wins when
|
||||||
|
/// non-empty; otherwise fall back to <c>EndpointUrl</c> as a single-URL shortcut so
|
||||||
|
/// existing single-endpoint configs keep working without migration.
|
||||||
|
/// </summary>
|
||||||
|
internal static IReadOnlyList<string> ResolveEndpointCandidates(OpcUaClientDriverOptions opts)
|
||||||
|
{
|
||||||
|
if (opts.EndpointUrls is { Count: > 0 }) return opts.EndpointUrls;
|
||||||
|
return [opts.EndpointUrl];
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Build the user-identity token from the driver options. Split out of
|
||||||
|
/// <see cref="InitializeAsync"/> so the failover sweep reuses one identity across
|
||||||
|
/// every endpoint attempt — generating it N times would re-unlock the user cert's
|
||||||
|
/// private key N times, wasteful + keeps the password in memory longer.
|
||||||
|
/// </summary>
|
||||||
|
internal static UserIdentity BuildUserIdentity(OpcUaClientDriverOptions options) =>
|
||||||
|
options.AuthType switch
|
||||||
|
{
|
||||||
|
OpcUaAuthType.Anonymous => new UserIdentity(new AnonymousIdentityToken()),
|
||||||
|
OpcUaAuthType.Username => new UserIdentity(
|
||||||
|
options.Username ?? string.Empty,
|
||||||
|
System.Text.Encoding.UTF8.GetBytes(options.Password ?? string.Empty)),
|
||||||
|
OpcUaAuthType.Certificate => BuildCertificateIdentity(options),
|
||||||
|
_ => new UserIdentity(new AnonymousIdentityToken()),
|
||||||
|
};
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Open a session against a single endpoint URL. Bounded by
|
||||||
|
/// <see cref="OpcUaClientDriverOptions.PerEndpointConnectTimeout"/> so the failover
|
||||||
|
/// sweep doesn't spend its full budget on one dead server. Moved out of
|
||||||
|
/// <see cref="InitializeAsync"/> so the failover loop body stays readable.
|
||||||
|
/// </summary>
|
||||||
|
private async Task<ISession> OpenSessionOnEndpointAsync(
|
||||||
|
ApplicationConfiguration appConfig,
|
||||||
|
string endpointUrl,
|
||||||
|
OpcUaSecurityPolicy policy,
|
||||||
|
OpcUaSecurityMode mode,
|
||||||
|
UserIdentity identity,
|
||||||
|
CancellationToken ct)
|
||||||
|
{
|
||||||
|
using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||||
|
cts.CancelAfter(_options.PerEndpointConnectTimeout);
|
||||||
|
|
||||||
|
var selected = await SelectMatchingEndpointAsync(
|
||||||
|
appConfig, endpointUrl, policy, mode, cts.Token).ConfigureAwait(false);
|
||||||
|
var endpointConfig = EndpointConfiguration.Create(appConfig);
|
||||||
|
endpointConfig.OperationTimeout = (int)_options.Timeout.TotalMilliseconds;
|
||||||
|
var endpoint = new ConfiguredEndpoint(null, selected, endpointConfig);
|
||||||
|
|
||||||
|
var session = await new DefaultSessionFactory(telemetry: null!).CreateAsync(
|
||||||
|
appConfig,
|
||||||
|
endpoint,
|
||||||
|
false, // updateBeforeConnect
|
||||||
|
_options.SessionName,
|
||||||
|
(uint)_options.SessionTimeout.TotalMilliseconds,
|
||||||
|
identity,
|
||||||
|
null, // preferredLocales
|
||||||
|
cts.Token).ConfigureAwait(false);
|
||||||
|
|
||||||
|
session.KeepAliveInterval = (int)_options.KeepAliveInterval.TotalMilliseconds;
|
||||||
|
return session;
|
||||||
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Select the remote endpoint matching both the requested <paramref name="policy"/>
|
/// Select the remote endpoint matching both the requested <paramref name="policy"/>
|
||||||
/// and <paramref name="mode"/>. The SDK's <c>CoreClientUtils.SelectEndpointAsync</c>
|
/// and <paramref name="mode"/>. The SDK's <c>CoreClientUtils.SelectEndpointAsync</c>
|
||||||
@@ -347,6 +402,7 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
|||||||
catch { /* best-effort */ }
|
catch { /* best-effort */ }
|
||||||
try { Session?.Dispose(); } catch { }
|
try { Session?.Dispose(); } catch { }
|
||||||
Session = null;
|
Session = null;
|
||||||
|
_connectedEndpointUrl = null;
|
||||||
|
|
||||||
TransitionTo(HostState.Unknown);
|
TransitionTo(HostState.Unknown);
|
||||||
_health = new DriverHealth(DriverState.Unknown, _health.LastSuccessfulRead, null);
|
_health = new DriverHealth(DriverState.Unknown, _health.LastSuccessfulRead, null);
|
||||||
@@ -718,8 +774,16 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
|||||||
|
|
||||||
// ---- IHostConnectivityProbe ----
|
// ---- IHostConnectivityProbe ----
|
||||||
|
|
||||||
/// <summary>Endpoint-URL-keyed host identity for the Admin /hosts dashboard.</summary>
|
/// <summary>
|
||||||
public string HostName => _options.EndpointUrl;
|
/// Endpoint-URL-keyed host identity for the Admin /hosts dashboard. Reflects the
|
||||||
|
/// endpoint the driver actually connected to after the failover sweep — not the
|
||||||
|
/// first URL in the candidate list — so operators see which of the configured
|
||||||
|
/// endpoints is currently serving traffic. Falls back to the first configured URL
|
||||||
|
/// pre-init so the dashboard has something to render before the first connect.
|
||||||
|
/// </summary>
|
||||||
|
public string HostName => _connectedEndpointUrl
|
||||||
|
?? ResolveEndpointCandidates(_options).FirstOrDefault()
|
||||||
|
?? _options.EndpointUrl;
|
||||||
|
|
||||||
public IReadOnlyList<HostConnectivityStatus> GetHostStatuses()
|
public IReadOnlyList<HostConnectivityStatus> GetHostStatuses()
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -13,9 +13,32 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient;
|
|||||||
/// </remarks>
|
/// </remarks>
|
||||||
public sealed class OpcUaClientDriverOptions
|
public sealed class OpcUaClientDriverOptions
|
||||||
{
|
{
|
||||||
/// <summary>Remote OPC UA endpoint URL, e.g. <c>opc.tcp://plc.internal:4840</c>.</summary>
|
/// <summary>
|
||||||
|
/// Remote OPC UA endpoint URL, e.g. <c>opc.tcp://plc.internal:4840</c>. Convenience
|
||||||
|
/// shortcut for a single-endpoint deployment — equivalent to setting
|
||||||
|
/// <see cref="EndpointUrls"/> to a list with this one URL. When both are provided,
|
||||||
|
/// the list wins and <see cref="EndpointUrl"/> is ignored.
|
||||||
|
/// </summary>
|
||||||
public string EndpointUrl { get; init; } = "opc.tcp://localhost:4840";
|
public string EndpointUrl { get; init; } = "opc.tcp://localhost:4840";
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Ordered list of candidate endpoint URLs for failover. The driver tries each in
|
||||||
|
/// order at <see cref="OpcUaClientDriver.InitializeAsync"/> and on session drop;
|
||||||
|
/// the first URL that successfully connects wins. Typical use-case: an OPC UA server
|
||||||
|
/// pair running in hot-standby (primary 4840 + backup 4841) where either can serve
|
||||||
|
/// the same address space. Leave unset (or empty) to use <see cref="EndpointUrl"/>
|
||||||
|
/// as a single-URL shortcut.
|
||||||
|
/// </summary>
|
||||||
|
public IReadOnlyList<string> EndpointUrls { get; init; } = [];
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Per-endpoint connect-attempt timeout during the failover sweep. Short enough that
|
||||||
|
/// cycling through several dead servers doesn't blow the overall init budget, long
|
||||||
|
/// enough to tolerate a slow TLS handshake on a healthy server. Applied independently
|
||||||
|
/// of <see cref="Timeout"/> which governs steady-state operations.
|
||||||
|
/// </summary>
|
||||||
|
public TimeSpan PerEndpointConnectTimeout { get; init; } = TimeSpan.FromSeconds(3);
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Security policy to require when selecting an endpoint. Either a
|
/// Security policy to require when selecting an endpoint. Either a
|
||||||
/// <see cref="OpcUaSecurityPolicy"/> enum constant or a free-form string (for
|
/// <see cref="OpcUaSecurityPolicy"/> enum constant or a free-form string (for
|
||||||
|
|||||||
@@ -0,0 +1,81 @@
|
|||||||
|
using Shouldly;
|
||||||
|
using Xunit;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||||
|
|
||||||
|
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
|
||||||
|
|
||||||
|
[Trait("Category", "Unit")]
|
||||||
|
public sealed class OpcUaClientFailoverTests
|
||||||
|
{
|
||||||
|
[Fact]
|
||||||
|
public void ResolveEndpointCandidates_prefers_EndpointUrls_when_provided()
|
||||||
|
{
|
||||||
|
var opts = new OpcUaClientDriverOptions
|
||||||
|
{
|
||||||
|
EndpointUrl = "opc.tcp://fallback:4840",
|
||||||
|
EndpointUrls = ["opc.tcp://primary:4840", "opc.tcp://backup:4841"],
|
||||||
|
};
|
||||||
|
var list = OpcUaClientDriver.ResolveEndpointCandidates(opts);
|
||||||
|
list.Count.ShouldBe(2);
|
||||||
|
list[0].ShouldBe("opc.tcp://primary:4840");
|
||||||
|
list[1].ShouldBe("opc.tcp://backup:4841");
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void ResolveEndpointCandidates_falls_back_to_single_EndpointUrl_when_list_empty()
|
||||||
|
{
|
||||||
|
var opts = new OpcUaClientDriverOptions { EndpointUrl = "opc.tcp://only:4840" };
|
||||||
|
var list = OpcUaClientDriver.ResolveEndpointCandidates(opts);
|
||||||
|
list.Count.ShouldBe(1);
|
||||||
|
list[0].ShouldBe("opc.tcp://only:4840");
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void ResolveEndpointCandidates_empty_list_treated_as_fallback_to_EndpointUrl()
|
||||||
|
{
|
||||||
|
// Explicit empty list should still fall back to the single-URL shortcut rather than
|
||||||
|
// producing a zero-candidate sweep that would immediately throw with no URLs tried.
|
||||||
|
var opts = new OpcUaClientDriverOptions
|
||||||
|
{
|
||||||
|
EndpointUrl = "opc.tcp://single:4840",
|
||||||
|
EndpointUrls = [],
|
||||||
|
};
|
||||||
|
OpcUaClientDriver.ResolveEndpointCandidates(opts).Count.ShouldBe(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void HostName_uses_first_candidate_before_connect()
|
||||||
|
{
|
||||||
|
var opts = new OpcUaClientDriverOptions
|
||||||
|
{
|
||||||
|
EndpointUrls = ["opc.tcp://primary:4840", "opc.tcp://backup:4841"],
|
||||||
|
};
|
||||||
|
using var drv = new OpcUaClientDriver(opts, "opcua-host");
|
||||||
|
drv.HostName.ShouldBe("opc.tcp://primary:4840",
|
||||||
|
"pre-connect the dashboard should show the first candidate URL so operators can link back");
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task Initialize_against_all_unreachable_endpoints_throws_AggregateException_listing_each()
|
||||||
|
{
|
||||||
|
// Port 1 + port 2 + port 3 on loopback are all guaranteed closed (TCP RST immediate).
|
||||||
|
// Failover sweep should attempt all three and throw AggregateException naming each URL
|
||||||
|
// so operators see exactly which candidates were tried.
|
||||||
|
var opts = new OpcUaClientDriverOptions
|
||||||
|
{
|
||||||
|
EndpointUrls = ["opc.tcp://127.0.0.1:1", "opc.tcp://127.0.0.1:2", "opc.tcp://127.0.0.1:3"],
|
||||||
|
PerEndpointConnectTimeout = TimeSpan.FromMilliseconds(500),
|
||||||
|
Timeout = TimeSpan.FromMilliseconds(500),
|
||||||
|
AutoAcceptCertificates = true,
|
||||||
|
};
|
||||||
|
using var drv = new OpcUaClientDriver(opts, "opcua-failover");
|
||||||
|
|
||||||
|
var ex = await Should.ThrowAsync<AggregateException>(async () =>
|
||||||
|
await drv.InitializeAsync("{}", TestContext.Current.CancellationToken));
|
||||||
|
|
||||||
|
ex.Message.ShouldContain("127.0.0.1:1");
|
||||||
|
ex.Message.ShouldContain("127.0.0.1:2");
|
||||||
|
ex.Message.ShouldContain("127.0.0.1:3");
|
||||||
|
drv.GetHealth().State.ShouldBe(DriverState.Faulted);
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user