From 43b96441a5dfdaa338bba9f8d89ea803397404b6 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Thu, 11 Jun 2026 09:10:52 -0400 Subject: [PATCH] fix(galaxy): reconnect recreates a faulted session instead of no-op'ing --- .../GalaxyDriver.cs | 10 +- .../Runtime/GalaxyMxSession.cs | 76 +++++++++++-- .../Runtime/GalaxyMxSessionReconnectTests.cs | 106 ++++++++++++++++++ 3 files changed, 177 insertions(+), 15 deletions(-) create mode 100644 tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/Runtime/GalaxyMxSessionReconnectTests.cs diff --git a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriver.cs b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriver.cs index 1bcc63b6..58412d0f 100644 --- a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriver.cs +++ b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriver.cs @@ -282,15 +282,17 @@ public sealed class GalaxyDriver } /// - /// Reopen callback for : re-Register the gw session. - /// If the session never connected, this is a fresh ConnectAsync; otherwise it's a - /// reconnect against the existing client. + /// Reopen callback for : recreate the gw session. After a + /// gateway restart the existing session handle is faulted / stale but non-null, so a plain + /// ConnectAsync would no-op and the supervisor would loop forever replaying against a + /// dead session. disposes the stale session + + /// owned client and rebuilds, so a never-connected session still opens fresh. /// private async Task ReopenAsync(CancellationToken cancellationToken) { if (_ownedMxSession is null) return; var clientOptions = BuildClientOptions(_options.Gateway); - await _ownedMxSession.ConnectAsync(clientOptions, cancellationToken).ConfigureAwait(false); + await _ownedMxSession.RecreateAsync(clientOptions, cancellationToken).ConfigureAwait(false); } /// diff --git a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/Runtime/GalaxyMxSession.cs b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/Runtime/GalaxyMxSession.cs index 7913c69c..d48d3528 100644 --- a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/Runtime/GalaxyMxSession.cs +++ b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/Runtime/GalaxyMxSession.cs @@ -28,6 +28,7 @@ public sealed class GalaxyMxSession : IAsyncDisposable private MxGatewayClient? _ownedClient; private MxGatewaySession? _session; private int _serverHandle; + private bool _connected; private bool _disposed; /// Initializes a new instance of the GalaxyMxSession class. @@ -48,24 +49,65 @@ public sealed class GalaxyMxSession : IAsyncDisposable /// public int ServerHandle => _serverHandle; + /// + /// Test seam — when set, replaces the real "open session + register" work so unit tests can + /// exercise the connect / recreate orchestration without a live gateway (the SDK session/client + /// types are sealed + internal-ctor and cannot be faked). Null in production = real gateway path. + /// + internal Func? OpenAndRegisterOverrideForTests { get; set; } + /// /// Connect the underlying gateway client + open an MXAccess session + register the - /// configured client name. Idempotent — second calls are no-ops while - /// is true. + /// configured client name. Idempotent — second calls are no-ops while a connect has + /// already succeeded. Use to force a rebuild after the + /// gateway restarts and leaves a faulted / stale session handle. /// /// The MX gateway client options. /// The cancellation token. public async Task ConnectAsync(MxGatewayClientOptions clientOptions, CancellationToken cancellationToken) { ObjectDisposedException.ThrowIf(_disposed, this); - if (_session is not null) return; + if (_connected) return; - _ownedClient = MxGatewayClient.Create(clientOptions); - _session = await _ownedClient.OpenSessionAsync(cancellationToken: cancellationToken).ConfigureAwait(false); - _serverHandle = await _session.RegisterAsync(_options.ClientName, cancellationToken).ConfigureAwait(false); - _logger.LogInformation( - "GalaxyMxSession connected — clientName={ClientName} serverHandle={Handle}", - _options.ClientName, _serverHandle); + try + { + if (OpenAndRegisterOverrideForTests is not null) + { + await OpenAndRegisterOverrideForTests(cancellationToken).ConfigureAwait(false); + } + else + { + _ownedClient = MxGatewayClient.Create(clientOptions); + _session = await _ownedClient.OpenSessionAsync(cancellationToken: cancellationToken).ConfigureAwait(false); + _serverHandle = await _session.RegisterAsync(_options.ClientName, cancellationToken).ConfigureAwait(false); + } + + _connected = true; + _logger.LogInformation( + "GalaxyMxSession connected — clientName={ClientName} serverHandle={Handle}", + _options.ClientName, _serverHandle); + } + catch + { + // A partial open (e.g. OpenSession succeeded but Register threw) must not leave a half-open + // handle that blocks the next reconnect. Tear the partial state down so a retry rebuilds cleanly. + await TeardownSessionAsync().ConfigureAwait(false); + throw; + } + } + + /// + /// Disposes the current (faulted / stale) session + owned client and rebuilds. Use this on + /// reconnect: alone no-ops while a (possibly dead) session handle is + /// still present, so the supervisor's reopen must route through here to actually re-establish. + /// + /// The MX gateway client options. + /// The cancellation token. + public async Task RecreateAsync(MxGatewayClientOptions clientOptions, CancellationToken cancellationToken) + { + ObjectDisposedException.ThrowIf(_disposed, this); + await TeardownSessionAsync().ConfigureAwait(false); + await ConnectAsync(clientOptions, cancellationToken).ConfigureAwait(false); } /// @@ -93,19 +135,31 @@ public sealed class GalaxyMxSession : IAsyncDisposable { if (_disposed) return; _disposed = true; + await TeardownSessionAsync().ConfigureAwait(false); + } + + /// + /// Resets the connect flag and best-effort disposes + nulls the owned session and client. + /// Shared by (final teardown) and / + /// the partial-open recovery path (rebuild teardown) — it does NOT set _disposed, so + /// a torn-down session can be re-opened. + /// + private async Task TeardownSessionAsync() + { + _connected = false; if (_session is not null) { try { await _session.DisposeAsync().ConfigureAwait(false); } catch (Exception ex) { _logger.LogWarning(ex, "GalaxyMxSession session dispose failed (best-effort)"); } + _session = null; } - _session = null; if (_ownedClient is not null) { try { await _ownedClient.DisposeAsync().ConfigureAwait(false); } catch (Exception ex) { _logger.LogWarning(ex, "GalaxyMxSession client dispose failed (best-effort)"); } + _ownedClient = null; } - _ownedClient = null; } } diff --git a/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/Runtime/GalaxyMxSessionReconnectTests.cs b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/Runtime/GalaxyMxSessionReconnectTests.cs new file mode 100644 index 00000000..b1e689eb --- /dev/null +++ b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/Runtime/GalaxyMxSessionReconnectTests.cs @@ -0,0 +1,106 @@ +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Config; +using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime; + +namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests.Runtime; + +/// +/// Connect / recreate orchestration tests for . The SDK +/// session/client types are sealed with internal ctors and cannot be faked, so these +/// drive the open body through the OpenAndRegisterOverrideForTests seam. The +/// core regression guard is : +/// a stale-session reconnect must rebuild rather than no-op (the gateway-restart bug). +/// +public sealed class GalaxyMxSessionReconnectTests +{ + private static GalaxyMxAccessOptions MinimalOptions() => new(ClientName: "OtOpcUaTest"); + + private static GalaxyMxSession NewSession() => new(MinimalOptions()); + + /// A second ConnectAsync while connected must be a no-op (idempotent guard). + [Fact] + public async Task Connect_then_connect_again_is_a_noop() + { + var session = NewSession(); + var openCount = 0; + session.OpenAndRegisterOverrideForTests = _ => + { + openCount++; + return Task.CompletedTask; + }; + + await session.ConnectAsync(null!, CancellationToken.None); + openCount.ShouldBe(1); + + await session.ConnectAsync(null!, CancellationToken.None); + openCount.ShouldBe(1); + } + + /// + /// The regression guard for the gateway-restart bug: RecreateAsync must bypass + /// the no-op guard and re-run the open body even though a (stale) session is present. + /// + [Fact] + public async Task Recreate_after_connect_opens_a_fresh_session() + { + var session = NewSession(); + var openCount = 0; + session.OpenAndRegisterOverrideForTests = _ => + { + openCount++; + return Task.CompletedTask; + }; + + await session.ConnectAsync(null!, CancellationToken.None); + openCount.ShouldBe(1); + + await session.RecreateAsync(null!, CancellationToken.None); + openCount.ShouldBe(2); + } + + /// RecreateAsync on a never-connected session still opens (teardown is a no-op first). + [Fact] + public async Task Recreate_when_never_connected_still_opens() + { + var session = NewSession(); + var openCount = 0; + session.OpenAndRegisterOverrideForTests = _ => + { + openCount++; + return Task.CompletedTask; + }; + + await session.RecreateAsync(null!, CancellationToken.None); + openCount.ShouldBe(1); + } + + /// + /// A failed first connect must not leave _connected set — the next attempt has + /// to reach the open body again (partial-open teardown). + /// + [Fact] + public async Task Connect_failure_is_not_left_half_open() + { + var session = NewSession(); + var openCount = 0; + session.OpenAndRegisterOverrideForTests = _ => + { + // Throw on the first attempt, succeed on the second. + if (openCount++ == 0) + { + throw new InvalidOperationException("simulated open failure"); + } + + return Task.CompletedTask; + }; + + await Should.ThrowAsync( + async () => await session.ConnectAsync(null!, CancellationToken.None)); + openCount.ShouldBe(1); + + // The failed first attempt must not have latched _connected — the retry reaches the body. + await session.ConnectAsync(null!, CancellationToken.None); + openCount.ShouldBe(2); + } +}