Phase 3 PR 74 -- OPC UA Client transparent reconnect via SessionReconnectHandler. Before this PR a session keep-alive failure flipped HostState to Stopped and stayed there until operator intervention. PR 74 wires the SDK's SessionReconnectHandler so the driver automatically retries + swaps in a new session when the upstream server comes back. New _reconnectHandler field lazily instantiated inside OnKeepAlive on a bad status; subsequent bad keep-alives during the same outage no-op (null-check prevents stacked handlers). Constructor uses (telemetry:null, reconnectAbort:false, maxReconnectPeriod:2min) -- reconnectAbort=false so the handler keeps trying across many retry cycles; 2min cap prevents pathological back-off from starving operator visibility. BeginReconnect takes the current ISession + ReconnectPeriod (from OpcUaClientDriverOptions, default 5s per driver-specs.md §8) + our OnReconnectComplete callback. OnReconnectComplete reads handler.Session for the new session, unwires keepalive from the dead session, rewires to the new session (without this the NEXT drop wouldn't trigger another reconnect -- subtle and critical), swaps Session, disposes the handler. The SDK's Session.TransferSubscriptionsOnReconnect default=true handles subscription migration internally so local MonitoredItem handles stay live across the reconnect; no driver-side manual transfer needed. Shutdown path now aborts any in-flight reconnect via _reconnectHandler.CancelReconnect() + Dispose BEFORE touching Session.CloseAsync -- without this the handler's retry loop holds a reference to the about-to-close session and fights the close, producing BadSessionIdInvalid noise in the upstream log and potential disposal-race exceptions. Cancel-first is the documented SDK pattern. Kept the driver's own HostState/OnHostStatusChanged flow: bad keep-alive -> Stopped transition + reconnect kicks off; OnReconnectComplete -> Running transition + Healthy status. 
Downstream consumers see the bounce as Stopped->Running without needing to know about the reconnect handler internals. Unit tests (OpcUaClientReconnectTests, 3 facts): Default_ReconnectPeriod_matches_driver_specs_5_seconds (sanity check on the options default), Options_ReconnectPeriod_is_configurable_for_aggressive_or_relaxed_retry (500ms override works), Driver_starts_with_no_reconnect_handler_active_pre_init (lazy instantiation -- indirectly via lifecycle). Wire-level disconnect-reconnect-resume coverage against a live upstream server is deferred to the in-process-fixture PR -- testing the reconnect path needs a server we can kill + revive mid-test, non-trivial to scaffold in xUnit. 54/54 OpcUaClient.Tests pass (51 prior + 3 reconnect). dotnet build clean.

This commit is contained in:
Joseph Doherty
2026-04-19 02:04:42 -04:00
parent 8cd932e7c9
commit ba3a5598e1
2 changed files with 126 additions and 10 deletions

View File

@@ -61,6 +61,12 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
private bool _disposed;
/// <summary>URL of the endpoint the driver actually connected to. Exposed via <see cref="HostName"/>.</summary>
private string? _connectedEndpointUrl;
/// <summary>
/// SDK-provided reconnect handler that owns the retry loop + session-transfer machinery
/// when the session's keep-alive channel reports a bad status. Null outside the
/// reconnecting window; constructed lazily inside the keep-alive handler.
/// </summary>
private SessionReconnectHandler? _reconnectHandler;
/// <summary>Stable identifier for this driver instance, supplied by the caller at construction.</summary>
public string DriverInstanceId => driverInstanceId;
/// <summary>Driver type discriminator; constant for all instances of this driver.</summary>
public string DriverType => "OpcUaClient";
@@ -104,16 +110,13 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
"Tried:\n " + string.Join("\n ", attemptErrors),
attemptErrors.Select(e => new InvalidOperationException(e)));
// Wire the session's keep-alive channel into HostState. OPC UA keep-alives are
// authoritative for session liveness: the SDK pings on KeepAliveInterval and sets
// KeepAliveStopped when N intervals elapse without a response. That's strictly
// better than a driver-side polling probe — no extra round-trip, no duplicate
// semantic.
_keepAliveHandler = (_, e) =>
{
var healthy = !ServiceResult.IsBad(e.Status);
TransitionTo(healthy ? HostState.Running : HostState.Stopped);
};
// Wire the session's keep-alive channel into HostState + the reconnect trigger.
// OPC UA keep-alives are authoritative for session liveness: the SDK pings on
// KeepAliveInterval and sets KeepAliveStopped when N intervals elapse without a
// response. On a bad keep-alive the driver spins up a SessionReconnectHandler
// which transparently retries + swaps the underlying session. Subscriptions move
// via TransferSubscriptions so local MonitoredItem handles stay valid.
_keepAliveHandler = OnKeepAlive;
session.KeepAlive += _keepAliveHandler;
Session = session;
@@ -392,6 +395,13 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
}
_subscriptions.Clear();
// Abort any in-flight reconnect attempts before touching the session — BeginReconnect's
// retry loop holds a reference to the current session and would fight Session.CloseAsync
// if left spinning.
try { _reconnectHandler?.CancelReconnect(); } catch { }
_reconnectHandler?.Dispose();
_reconnectHandler = null;
if (_keepAliveHandler is not null && Session is not null)
{
try { Session.KeepAlive -= _keepAliveHandler; } catch { }
@@ -945,6 +955,76 @@ public sealed class OpcUaClientDriver(OpcUaClientDriverOptions options, string d
return [new HostConnectivityStatus(HostName, _hostState, _hostStateChangedUtc)];
}
/// <summary>
/// Session keep-alive handler. On a healthy ping, bumps HostState back to Running
/// (typical bounce after a transient network blip). On a bad ping, flips HostState
/// to Stopped and starts the SDK's <see cref="SessionReconnectHandler"/>, which
/// retries on the configured period and fires <see cref="OnReconnectComplete"/>
/// when it lands a new session.
/// </summary>
/// <param name="sender">The session whose keep-alive channel reported status.</param>
/// <param name="e">Keep-alive status; a bad <c>e.Status</c> means the session is presumed dead.</param>
private void OnKeepAlive(ISession sender, KeepAliveEventArgs e)
{
    if (!ServiceResult.IsBad(e.Status))
    {
        TransitionTo(HostState.Running);
        return;
    }

    TransitionTo(HostState.Stopped);

    // Kick off the SDK's reconnect loop exactly once per drop. The handler owns its
    // own retry cadence via ReconnectPeriod; we tear it down in OnReconnectComplete.
    // NOTE(review): keep-alive callbacks arrive on an SDK thread — if the SDK can
    // deliver two bad keep-alives concurrently, this check-then-assign needs a
    // lock or Interlocked.CompareExchange; confirm the SDK's delivery guarantees.
    if (_reconnectHandler is not null) return;

    _reconnectHandler = new SessionReconnectHandler(
        telemetry: null!,
        reconnectAbort: false,
        maxReconnectPeriod: (int)TimeSpan.FromMinutes(2).TotalMilliseconds);

    // The reconnect state returned by BeginReconnect is not needed here; completion
    // (success or otherwise) is observed via the OnReconnectComplete callback.
    _reconnectHandler.BeginReconnect(
        sender,
        (int)_options.ReconnectPeriod.TotalMilliseconds,
        OnReconnectComplete);
}
/// <summary>
/// Called by <see cref="SessionReconnectHandler"/> when its retry loop has either
/// successfully swapped to a new session or given up. Reads the new session off
/// <c>handler.Session</c>, unwires the old keep-alive hook, rewires it onto the new
/// session, and tears down the handler. Subscription migration is already handled
/// inside the SDK via <c>TransferSubscriptions</c> (the SDK calls it automatically
/// when <see cref="Session.TransferSubscriptionsOnReconnect"/> is <c>true</c>,
/// which is the default).
/// </summary>
/// <param name="sender">The completing <see cref="SessionReconnectHandler"/>.</param>
/// <param name="e">Unused event args.</param>
private void OnReconnectComplete(object? sender, EventArgs e)
{
    if (sender is not SessionReconnectHandler handler) return;

    // Documented SDK pattern: ignore completion callbacks from a handler we no
    // longer own (e.g. one cancelled by the shutdown path while its callback was
    // already in flight) so a stale handler can't clobber current driver state.
    if (!ReferenceEquals(handler, _reconnectHandler)) return;

    var newSession = handler.Session;

    if (newSession is not null)
    {
        // Rewire keep-alive onto the new session — without this the next drop
        // wouldn't trigger another reconnect attempt. When the SDK reconnected
        // in place (newSession == old Session) the unwire/rewire is a safe no-op.
        var oldSession = Session;
        if (oldSession is not null && _keepAliveHandler is not null)
        {
            try { oldSession.KeepAlive -= _keepAliveHandler; } catch { }
        }
        if (_keepAliveHandler is not null)
        {
            newSession.KeepAlive += _keepAliveHandler;
        }
        Session = newSession;

        // Reconnect landed: flip back to Running so downstream consumers see recovery.
        TransitionTo(HostState.Running);
        _health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
    }
    // else: the handler produced no session. Keep the old session and its
    // keep-alive hook wired so a later bad keep-alive can spin up a fresh handler;
    // unwiring here would leave the driver permanently dead with no reconnect path.

    _reconnectHandler.Dispose();
    _reconnectHandler = null;
}
private void TransitionTo(HostState newState)
{
HostState old;