Phase 3 PR 72 -- Multi-endpoint failover for OPC UA Client driver. Adds OpcUaClientDriverOptions.EndpointUrls ordered list + PerEndpointConnectTimeout knob. On InitializeAsync the driver walks the candidate list in order via ResolveEndpointCandidates and returns the session from the first endpoint that successfully connects. Captures per-URL failure reasons in a List<string> and, if every candidate fails, throws AggregateException whose message names every URL + its failure class (e.g. 'opc.tcp://primary:4840 -> TimeoutException: ...'). That's critical diag for field debugging -- without it 'failover picked the wrong one' surfaces as a mystery. Single-URL backwards compat: EndpointUrl field retained as a one-URL shortcut. When EndpointUrls is null or empty the driver falls through to a single-candidate list of [EndpointUrl], so every existing single-endpoint config keeps working without migration. When both are provided, EndpointUrls wins + EndpointUrl is ignored -- documented on the field xml-doc. Per-endpoint connect budget: PerEndpointConnectTimeout (default 3s) caps each attempt so a sweep over several dead servers can't blow the overall init budget. Applied via CancellationTokenSource.CreateLinkedTokenSource + CancelAfter inside OpenSessionOnEndpointAsync (the extracted single-endpoint connect helper) so the cap is independent of the outer Options.Timeout which governs steady-state ops. BuildUserIdentity extracted out of InitializeAsync so the failover loop builds the UserIdentity ONCE and reuses it across every endpoint attempt -- generating it N times would re-unlock the user cert's private key N times, wasteful + keeps the password in memory longer. HostName now reflects the endpoint that actually connected via _connectedEndpointUrl instead of always returning opts.EndpointUrl -- so the Admin /hosts dashboard shows which of the configured endpoints is currently serving traffic (primary vs backup). 
Falls back to the first candidate pre-connect so the dashboard has a sensible identity before the first connect, and resets to null on ShutdownAsync. Use case: an OPC UA hot-standby server pair (primary 4840 + backup 4841) where either can serve the same address space. Operator configures EndpointUrls=[primary, backup]; driver tries primary first, falls over to backup on primary failure with a clean AggregateException describing both attempts if both are down. Unit tests (OpcUaClientFailoverTests, 5 facts): ResolveEndpointCandidates_prefers_EndpointUrls_when_provided (list trumps single), ResolveEndpointCandidates_falls_back_to_single_EndpointUrl_when_list_empty (legacy config compat), ResolveEndpointCandidates_empty_list_treated_as_fallback (explicit empty list also falls back -- otherwise we'd produce a zero-candidate sweep that throws with nothing tried), HostName_uses_first_candidate_before_connect (dashboard rendering pre-connect), Initialize_against_all_unreachable_endpoints_throws_AggregateException_listing_each (three loopback dead ports, asserts each URL appears in the aggregate message + driver flips to Faulted). 31/31 OpcUaClient.Tests pass. dotnet build clean. OPC UA Client driver security/auth/availability feature set now complete per driver-specs.md §8: policy-filtered endpoint selection (PR 70), Anonymous+Username+Certificate auth (PR 71), multi-endpoint failover (this PR).
This commit is contained in:
@@ -59,6 +59,8 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
|
||||
private DriverHealth _health = new(DriverState.Unknown, null, null);
|
||||
private bool _disposed;
|
||||
/// <summary>URL of the endpoint the driver actually connected to. Exposed via <see cref="HostName"/>.</summary>
|
||||
private string? _connectedEndpointUrl;
|
||||
|
||||
public string DriverInstanceId => driverInstanceId;
|
||||
public string DriverType => "OpcUaClient";
|
||||
@@ -69,51 +71,38 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
try
|
||||
{
|
||||
var appConfig = await BuildApplicationConfigurationAsync(cancellationToken).ConfigureAwait(false);
|
||||
var candidates = ResolveEndpointCandidates(_options);
|
||||
|
||||
// Endpoint selection: let the stack pick the best matching endpoint for the
|
||||
// requested security policy/mode so the driver doesn't have to hand-validate.
|
||||
// UseSecurity=false when SecurityMode=None shortcuts around cert validation
|
||||
// entirely and is the typical dev-bench configuration.
|
||||
var selected = await SelectMatchingEndpointAsync(
|
||||
appConfig, _options.EndpointUrl, _options.SecurityPolicy, _options.SecurityMode,
|
||||
cancellationToken).ConfigureAwait(false);
|
||||
var endpointConfig = EndpointConfiguration.Create(appConfig);
|
||||
endpointConfig.OperationTimeout = (int)_options.Timeout.TotalMilliseconds;
|
||||
var endpoint = new ConfiguredEndpoint(null, selected, endpointConfig);
|
||||
var identity = BuildUserIdentity(_options);
|
||||
|
||||
var identity = _options.AuthType switch
|
||||
// Failover sweep: try each endpoint in order, return the session from the first
|
||||
// one that successfully connects. Per-endpoint failures are captured so the final
|
||||
// aggregate exception names every URL that was tried and why — critical diag for
|
||||
// operators debugging 'why did the failover pick #3?'.
|
||||
var attemptErrors = new List<string>(candidates.Count);
|
||||
ISession? session = null;
|
||||
string? connectedUrl = null;
|
||||
foreach (var url in candidates)
|
||||
{
|
||||
OpcUaAuthType.Anonymous => new UserIdentity(new AnonymousIdentityToken()),
|
||||
// The UserIdentity(string, string) overload was removed in favour of
|
||||
// (string, byte[]) to make the password encoding explicit. UTF-8 is the
|
||||
// overwhelmingly common choice for Basic256Sha256-secured sessions.
|
||||
OpcUaAuthType.Username => new UserIdentity(
|
||||
_options.Username ?? string.Empty,
|
||||
System.Text.Encoding.UTF8.GetBytes(_options.Password ?? string.Empty)),
|
||||
OpcUaAuthType.Certificate => BuildCertificateIdentity(_options),
|
||||
_ => new UserIdentity(new AnonymousIdentityToken()),
|
||||
};
|
||||
try
|
||||
{
|
||||
session = await OpenSessionOnEndpointAsync(
|
||||
appConfig, url, _options.SecurityPolicy, _options.SecurityMode,
|
||||
identity, cancellationToken).ConfigureAwait(false);
|
||||
connectedUrl = url;
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
attemptErrors.Add($"{url} -> {ex.GetType().Name}: {ex.Message}");
|
||||
}
|
||||
}
|
||||
|
||||
// All Session.Create* static methods are marked [Obsolete] in SDK 1.5.378; the
|
||||
// non-obsolete path is DefaultSessionFactory.Instance.CreateAsync (which is the
|
||||
// 8-arg signature matching our driver config — ApplicationConfiguration +
|
||||
// ConfiguredEndpoint, no transport-waiting-connection or reverse-connect-manager
|
||||
// required for the standard opc.tcp direct-connect case).
|
||||
// DefaultSessionFactory's parameterless ctor is also obsolete in 1.5.378; the
|
||||
// current constructor requires an ITelemetryContext. Passing null is tolerated —
|
||||
// the factory falls back to its internal default sink, same as the telemetry:null
|
||||
// on SelectEndpointAsync above.
|
||||
var session = await new DefaultSessionFactory(telemetry: null!).CreateAsync(
|
||||
appConfig,
|
||||
endpoint,
|
||||
false, // updateBeforeConnect
|
||||
_options.SessionName,
|
||||
(uint)_options.SessionTimeout.TotalMilliseconds,
|
||||
identity,
|
||||
null, // preferredLocales
|
||||
cancellationToken).ConfigureAwait(false);
|
||||
|
||||
session.KeepAliveInterval = (int)_options.KeepAliveInterval.TotalMilliseconds;
|
||||
if (session is null)
|
||||
throw new AggregateException(
|
||||
"OPC UA Client failed to connect to any of the configured endpoints. " +
|
||||
"Tried:\n " + string.Join("\n ", attemptErrors),
|
||||
attemptErrors.Select(e => new InvalidOperationException(e)));
|
||||
|
||||
// Wire the session's keep-alive channel into HostState. OPC UA keep-alives are
|
||||
// authoritative for session liveness: the SDK pings on KeepAliveInterval and sets
|
||||
@@ -128,6 +117,7 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
session.KeepAlive += _keepAliveHandler;
|
||||
|
||||
Session = session;
|
||||
_connectedEndpointUrl = connectedUrl;
|
||||
_health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
|
||||
TransitionTo(HostState.Running);
|
||||
}
|
||||
@@ -224,6 +214,71 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
return config;
|
||||
}
|
||||
|
||||
/// <summary>
/// Produces the ordered list of endpoint URLs the failover sweep should try.
/// A populated <c>EndpointUrls</c> list is returned as-is; a null or empty list
/// yields a one-element list holding <c>EndpointUrl</c>, so configurations that
/// only set the single-URL field keep working unchanged.
/// </summary>
/// <param name="opts">Driver options carrying both the URL list and the single-URL field.</param>
/// <returns>The candidate endpoint URLs, in the order they should be attempted.</returns>
internal static IReadOnlyList<string> ResolveEndpointCandidates(OpcUaClientDriverOptions opts)
{
    var configured = opts.EndpointUrls;

    // Null and explicitly-empty lists are treated the same way: fall back to the
    // single-URL shortcut rather than returning a zero-candidate list.
    if (configured is null || configured.Count <= 0)
    {
        return [opts.EndpointUrl];
    }

    return configured;
}
|
||||
|
||||
/// <summary>
/// Creates the user-identity token described by <paramref name="options"/>.
/// Kept as a standalone helper so a caller that tries several endpoints can build
/// the identity once and hand the same instance to every attempt instead of
/// rebuilding it per endpoint.
/// </summary>
/// <param name="options">Driver options supplying the auth type and its credentials.</param>
/// <returns>
/// A username identity (null username/password are replaced by empty strings and the
/// password is UTF-8 encoded), a certificate identity from
/// <c>BuildCertificateIdentity</c>, or an anonymous identity for
/// <c>Anonymous</c> and for any auth type not listed here.
/// </returns>
internal static UserIdentity BuildUserIdentity(OpcUaClientDriverOptions options)
{
    switch (options.AuthType)
    {
        case OpcUaAuthType.Username:
        {
            var userName = options.Username ?? string.Empty;
            // The password is passed as bytes; UTF-8 is the encoding used here.
            var passwordBytes = System.Text.Encoding.UTF8.GetBytes(options.Password ?? string.Empty);
            return new UserIdentity(userName, passwordBytes);
        }

        case OpcUaAuthType.Certificate:
            return BuildCertificateIdentity(options);

        case OpcUaAuthType.Anonymous:
        default:
            // Unrecognised auth types deliberately share the anonymous path.
            return new UserIdentity(new AnonymousIdentityToken());
    }
}
|
||||
|
||||
/// <summary>
/// Opens a session against exactly one endpoint URL: selects the remote endpoint
/// matching <paramref name="policy"/> and <paramref name="mode"/>, creates the session
/// through <c>DefaultSessionFactory</c>, and applies the configured keep-alive interval.
/// Endpoint selection and session creation together are bounded by
/// <see cref="OpcUaClientDriverOptions.PerEndpointConnectTimeout"/> so a failover
/// sweep does not spend its whole budget on one dead server.
/// </summary>
/// <param name="appConfig">Application configuration used for endpoint selection and session creation.</param>
/// <param name="endpointUrl">URL of the single endpoint to try.</param>
/// <param name="policy">Security policy the selected endpoint must match.</param>
/// <param name="mode">Security mode the selected endpoint must match.</param>
/// <param name="identity">User identity presented when the session is created.</param>
/// <param name="ct">Caller's cancellation token; linked with the per-endpoint budget.</param>
/// <returns>The connected session with <c>KeepAliveInterval</c> already applied.</returns>
/// <exception cref="TimeoutException">
/// The per-endpoint connect budget elapsed before the session was established
/// (and the caller's token was not the one that cancelled).
/// </exception>
/// <exception cref="OperationCanceledException">
/// <paramref name="ct"/> was cancelled by the caller.
/// </exception>
private async Task<ISession> OpenSessionOnEndpointAsync(
    ApplicationConfiguration appConfig,
    string endpointUrl,
    OpcUaSecurityPolicy policy,
    OpcUaSecurityMode mode,
    UserIdentity identity,
    CancellationToken ct)
{
    // Linked source: fires on caller cancellation OR when the per-endpoint budget runs out.
    using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
    cts.CancelAfter(_options.PerEndpointConnectTimeout);

    try
    {
        var selected = await SelectMatchingEndpointAsync(
            appConfig, endpointUrl, policy, mode, cts.Token).ConfigureAwait(false);

        var endpointConfig = EndpointConfiguration.Create(appConfig);
        // OperationTimeout comes from Options.Timeout, not from the connect budget above.
        endpointConfig.OperationTimeout = (int)_options.Timeout.TotalMilliseconds;
        var endpoint = new ConfiguredEndpoint(null, selected, endpointConfig);

        // NOTE(review): telemetry is passed as null! on purpose; this relies on the
        // factory tolerating a null telemetry context — confirm when upgrading the SDK.
        var session = await new DefaultSessionFactory(telemetry: null!).CreateAsync(
            appConfig,
            endpoint,
            false, // updateBeforeConnect
            _options.SessionName,
            (uint)_options.SessionTimeout.TotalMilliseconds,
            identity,
            null, // preferredLocales
            cts.Token).ConfigureAwait(false);

        session.KeepAliveInterval = (int)_options.KeepAliveInterval.TotalMilliseconds;
        return session;
    }
    catch (OperationCanceledException ex)
        when (cts.IsCancellationRequested && !ct.IsCancellationRequested)
    {
        // Only the budget timer fired. Surface that as a timeout so per-endpoint
        // diagnostics read "TimeoutException" instead of an anonymous
        // "A task was canceled", and so it cannot be mistaken for caller cancellation.
        throw new TimeoutException(
            $"Connecting to '{endpointUrl}' did not complete within PerEndpointConnectTimeout ({_options.PerEndpointConnectTimeout}).",
            ex);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Select the remote endpoint matching both the requested <paramref name="policy"/>
|
||||
/// and <paramref name="mode"/>. The SDK's <c>CoreClientUtils.SelectEndpointAsync</c>
|
||||
@@ -347,6 +402,7 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
catch { /* best-effort */ }
|
||||
try { Session?.Dispose(); } catch { }
|
||||
Session = null;
|
||||
_connectedEndpointUrl = null;
|
||||
|
||||
TransitionTo(HostState.Unknown);
|
||||
_health = new DriverHealth(DriverState.Unknown, _health.LastSuccessfulRead, null);
|
||||
@@ -718,8 +774,16 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
|
||||
|
||||
// ---- IHostConnectivityProbe ----
|
||||
|
||||
/// <summary>Endpoint-URL-keyed host identity for the Admin /hosts dashboard.</summary>
|
||||
public string HostName => _options.EndpointUrl;
|
||||
/// <summary>
/// Host identity for the Admin /hosts dashboard, keyed by endpoint URL. Once a
/// connection has been recorded this is the URL the driver actually connected to,
/// which may differ from the first configured candidate after a failover. Before
/// any connection is recorded it is the first configured candidate, with
/// <c>EndpointUrl</c> as the last-resort value.
/// </summary>
public string HostName
{
    get
    {
        // Prefer the endpoint that is really serving traffic.
        if (_connectedEndpointUrl is { } connected)
        {
            return connected;
        }

        // Nothing connected yet: show the first candidate the sweep would try.
        var firstConfigured = ResolveEndpointCandidates(_options).FirstOrDefault();
        return firstConfigured ?? _options.EndpointUrl;
    }
}
|
||||
|
||||
public IReadOnlyList<HostConnectivityStatus> GetHostStatuses()
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user