fix(galaxy): reconnect recreates a faulted session instead of no-op'ing

This commit is contained in:
Joseph Doherty
2026-06-11 09:10:52 -04:00
parent 4f291ed09c
commit 43b96441a5
3 changed files with 177 additions and 15 deletions
@@ -282,15 +282,17 @@ public sealed class GalaxyDriver
}
/// <summary>
/// Reopen callback for <see cref="ReconnectSupervisor"/>: recreate the gw session. After a
/// gateway restart the existing session handle is faulted / stale but non-null, so a plain
/// <c>ConnectAsync</c> would no-op and the supervisor would loop forever replaying against a
/// dead session. <see cref="GalaxyMxSession.RecreateAsync"/> disposes the stale session +
/// owned client and rebuilds, so a never-connected session still opens fresh.
/// </summary>
/// <param name="cancellationToken">Token forwarded to the session rebuild.</param>
/// <returns>
/// A task that completes when the rebuild finishes, or immediately when
/// <c>_ownedMxSession</c> is null.
/// </returns>
private async Task ReopenAsync(CancellationToken cancellationToken)
{
    // No owned session object means there is nothing to rebuild.
    if (_ownedMxSession is null)
    {
        return;
    }

    var clientOptions = BuildClientOptions(_options.Gateway);

    // Go straight to RecreateAsync. Calling ConnectAsync first would either no-op (already
    // connected) or open a session that RecreateAsync immediately tears down again.
    await _ownedMxSession.RecreateAsync(clientOptions, cancellationToken).ConfigureAwait(false);
}
/// <summary>
@@ -28,6 +28,7 @@ public sealed class GalaxyMxSession : IAsyncDisposable
// Gateway client created inside ConnectAsync; disposed and nulled by TeardownSessionAsync.
private MxGatewayClient? _ownedClient;
// Session opened from _ownedClient inside ConnectAsync; disposed and nulled by TeardownSessionAsync.
private MxGatewaySession? _session;
// Value returned by the session's RegisterAsync call; surfaced through ServerHandle.
// NOTE(review): TeardownSessionAsync does not reset this, so it keeps the previous
// registration's value until the next successful register — confirm that is intended.
private int _serverHandle;
// Latched to true once ConnectAsync's open + register step succeeds; cleared by TeardownSessionAsync.
private bool _connected;
// Once true, ConnectAsync and RecreateAsync throw ObjectDisposedException.
private bool _disposed;
/// <summary>Initializes a new instance of the GalaxyMxSession class.</summary>
@@ -48,24 +49,65 @@ public sealed class GalaxyMxSession : IAsyncDisposable
/// </summary>
public int ServerHandle => _serverHandle;
/// <summary>
/// Test seam — when set, replaces the real "open session + register" work so unit tests can
/// exercise the connect / recreate orchestration without a live gateway (the SDK session/client
/// types are sealed + internal-ctor and cannot be faked). Null in production = real gateway path.
/// </summary>
/// <remarks>
/// The delegate is awaited by <see cref="ConnectAsync"/> and receives the cancellation token
/// that was passed to that call. An exception thrown by the delegate propagates to the caller.
/// </remarks>
internal Func<CancellationToken, Task>? OpenAndRegisterOverrideForTests { get; set; }
/// <summary>
/// Connect the underlying gateway client + open an MXAccess session + register the
/// configured client name. Idempotent — second calls are no-ops while a connect has
/// already succeeded. Use <see cref="RecreateAsync"/> to force a rebuild after the
/// gateway restarts and leaves a faulted / stale session handle.
/// </summary>
/// <param name="clientOptions">The MX gateway client options.</param>
/// <param name="cancellationToken">The cancellation token.</param>
/// <returns>A task that completes once the session is opened and registered.</returns>
/// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
public async Task ConnectAsync(MxGatewayClientOptions clientOptions, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);

    // Gate on the success latch rather than on _session: the test seam never assigns
    // _session, and a failed open clears everything, so _connected is the only field
    // that reliably means "a connect has already succeeded".
    if (_connected)
    {
        return;
    }

    try
    {
        if (OpenAndRegisterOverrideForTests is not null)
        {
            await OpenAndRegisterOverrideForTests(cancellationToken).ConfigureAwait(false);
        }
        else
        {
            _ownedClient = MxGatewayClient.Create(clientOptions);
            _session = await _ownedClient.OpenSessionAsync(cancellationToken: cancellationToken).ConfigureAwait(false);
            _serverHandle = await _session.RegisterAsync(_options.ClientName, cancellationToken).ConfigureAwait(false);
        }

        // Latch only after every step succeeded so a failure can never look connected.
        _connected = true;
        _logger.LogInformation(
            "GalaxyMxSession connected — clientName={ClientName} serverHandle={Handle}",
            _options.ClientName, _serverHandle);
    }
    catch
    {
        // A partial open (e.g. OpenSession succeeded but Register threw) must not leave a half-open
        // handle that blocks the next reconnect. Tear the partial state down so a retry rebuilds cleanly.
        await TeardownSessionAsync().ConfigureAwait(false);
        throw;
    }
}
/// <summary>
/// Disposes the current (faulted / stale) session + owned client and rebuilds. Use this on
/// reconnect: <see cref="ConnectAsync"/> alone no-ops while a (possibly dead) session handle is
/// still present, so the supervisor's reopen must route through here to actually re-establish.
/// </summary>
/// <param name="clientOptions">The MX gateway client options.</param>
/// <param name="cancellationToken">The cancellation token.</param>
/// <returns>A task that completes once the teardown and the subsequent connect have finished.</returns>
/// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
public async Task RecreateAsync(MxGatewayClientOptions clientOptions, CancellationToken cancellationToken)
{
ObjectDisposedException.ThrowIf(_disposed, this);
// Teardown clears the connect flag, which is what lets the ConnectAsync call below run its
// open body again. It is best-effort (dispose failures are logged, not thrown) and is not
// passed the cancellation token.
await TeardownSessionAsync().ConfigureAwait(false);
// Any failure while reopening propagates to the caller from ConnectAsync.
await ConnectAsync(clientOptions, cancellationToken).ConfigureAwait(false);
}
/// <summary>
@@ -93,19 +135,31 @@ public sealed class GalaxyMxSession : IAsyncDisposable
{
if (_disposed) return;
_disposed = true;
await TeardownSessionAsync().ConfigureAwait(false);
}
/// <summary>
/// Resets the connect flag and best-effort disposes + nulls the owned session and client.
/// Shared by <see cref="DisposeAsync"/> (final teardown) and <see cref="RecreateAsync"/> /
/// the partial-open recovery path (rebuild teardown) — it does NOT set <c>_disposed</c>, so
/// a torn-down session can be re-opened.
/// </summary>
/// <returns>A task that completes once both dispose attempts have finished.</returns>
private async Task TeardownSessionAsync()
{
    // Clear the latch first so the instance never reports connected while being dismantled.
    _connected = false;

    // Session before client: the session was opened from the client, so release it first.
    if (_session is not null)
    {
        try
        {
            await _session.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            // Best-effort: a faulted session may throw on dispose; log and keep going.
            _logger.LogWarning(ex, "GalaxyMxSession session dispose failed (best-effort)");
        }

        _session = null;
    }

    if (_ownedClient is not null)
    {
        try
        {
            await _ownedClient.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "GalaxyMxSession client dispose failed (best-effort)");
        }

        _ownedClient = null;
    }
}
}
@@ -0,0 +1,106 @@
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Config;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests.Runtime;
/// <summary>
/// Connect / recreate orchestration tests for <see cref="GalaxyMxSession"/>. The SDK
/// session/client types are sealed with internal ctors and cannot be faked, so these
/// drive the open body through the <c>OpenAndRegisterOverrideForTests</c> seam. The
/// core regression guard is <see cref="Recreate_after_connect_opens_a_fresh_session"/>:
/// a stale-session reconnect must rebuild rather than no-op (the gateway-restart bug).
/// </summary>
/// <remarks>
/// Every test passes <c>null!</c> as the client options: with the seam installed the real
/// gateway path, the only consumer of the options, is never reached. Sessions are created
/// with <c>await using</c> so each one is disposed even when an assertion fails.
/// </remarks>
public sealed class GalaxyMxSessionReconnectTests
{
    /// <summary>Smallest option set the session under test is constructed with.</summary>
    private static GalaxyMxAccessOptions MinimalOptions() => new(ClientName: "OtOpcUaTest");

    /// <summary>Creates a fresh, never-connected session for a single test.</summary>
    private static GalaxyMxSession NewSession() => new(MinimalOptions());

    /// <summary>A second <c>ConnectAsync</c> while connected must be a no-op (idempotent guard).</summary>
    [Fact]
    public async Task Connect_then_connect_again_is_a_noop()
    {
        await using var session = NewSession();
        var openCount = 0;
        session.OpenAndRegisterOverrideForTests = _ =>
        {
            openCount++;
            return Task.CompletedTask;
        };

        await session.ConnectAsync(null!, CancellationToken.None);
        openCount.ShouldBe(1);

        await session.ConnectAsync(null!, CancellationToken.None);
        openCount.ShouldBe(1);
    }

    /// <summary>
    /// The regression guard for the gateway-restart bug: <c>RecreateAsync</c> must bypass
    /// the no-op guard and re-run the open body even though a (stale) session is present.
    /// </summary>
    [Fact]
    public async Task Recreate_after_connect_opens_a_fresh_session()
    {
        await using var session = NewSession();
        var openCount = 0;
        session.OpenAndRegisterOverrideForTests = _ =>
        {
            openCount++;
            return Task.CompletedTask;
        };

        await session.ConnectAsync(null!, CancellationToken.None);
        openCount.ShouldBe(1);

        await session.RecreateAsync(null!, CancellationToken.None);
        openCount.ShouldBe(2);
    }

    /// <summary><c>RecreateAsync</c> on a never-connected session still opens (teardown is a no-op first).</summary>
    [Fact]
    public async Task Recreate_when_never_connected_still_opens()
    {
        await using var session = NewSession();
        var openCount = 0;
        session.OpenAndRegisterOverrideForTests = _ =>
        {
            openCount++;
            return Task.CompletedTask;
        };

        await session.RecreateAsync(null!, CancellationToken.None);
        openCount.ShouldBe(1);
    }

    /// <summary>
    /// A failed first connect must not leave <c>_connected</c> set — the next attempt has
    /// to reach the open body again (partial-open teardown).
    /// </summary>
    [Fact]
    public async Task Connect_failure_is_not_left_half_open()
    {
        await using var session = NewSession();
        var openCount = 0;
        session.OpenAndRegisterOverrideForTests = _ =>
        {
            // Throw on the first attempt, succeed on the second.
            if (openCount++ == 0)
            {
                throw new InvalidOperationException("simulated open failure");
            }

            return Task.CompletedTask;
        };

        await Should.ThrowAsync<InvalidOperationException>(
            async () => await session.ConnectAsync(null!, CancellationToken.None));
        openCount.ShouldBe(1);

        // The failed first attempt must not have latched _connected — the retry reaches the body.
        await session.ConnectAsync(null!, CancellationToken.None);
        openCount.ShouldBe(2);
    }
}