fix(galaxy): reconnect recreates a faulted session instead of no-op'ing

This commit is contained in:
Joseph Doherty
2026-06-11 09:10:52 -04:00
parent 4f291ed09c
commit 43b96441a5
3 changed files with 177 additions and 15 deletions
@@ -282,15 +282,17 @@ public sealed class GalaxyDriver
}
/// <summary>
/// Reopen callback for <see cref="ReconnectSupervisor"/>: recreate the gw session. After a
/// gateway restart the existing session handle is faulted / stale but non-null, so a plain
/// <c>ConnectAsync</c> would no-op and the supervisor would loop forever replaying against a
/// dead session. <see cref="GalaxyMxSession.RecreateAsync"/> disposes the stale session +
/// owned client and rebuilds, so a never-connected session still opens fresh.
/// </summary>
/// <param name="cancellationToken">Token forwarded to the session rebuild.</param>
private async Task ReopenAsync(CancellationToken cancellationToken)
{
    // Nothing to reopen when this driver holds no owned session.
    if (_ownedMxSession is null) return;

    var clientOptions = BuildClientOptions(_options.Gateway);

    // Go straight to RecreateAsync: calling ConnectAsync first would either no-op or open a
    // session that is torn down and rebuilt immediately afterwards.
    await _ownedMxSession.RecreateAsync(clientOptions, cancellationToken).ConfigureAwait(false);
}
/// <summary>
@@ -28,6 +28,7 @@ public sealed class GalaxyMxSession : IAsyncDisposable
private MxGatewayClient? _ownedClient;
private MxGatewaySession? _session;
private int _serverHandle;
private bool _connected;
private bool _disposed;
/// <summary>Initializes a new instance of the GalaxyMxSession class.</summary>
@@ -48,24 +49,65 @@ public sealed class GalaxyMxSession : IAsyncDisposable
/// </summary>
public int ServerHandle => _serverHandle;
/// <summary>
/// Test seam — when set, replaces the real "open session + register" work so unit tests can
/// exercise the connect / recreate orchestration without a live gateway (the SDK session/client
/// types are sealed + internal-ctor and cannot be faked). Null in production = real gateway path.
/// </summary>
/// <remarks>
/// <see cref="ConnectAsync"/> awaits this delegate, passing its own cancellation token, in place
/// of the create-client / open-session / register branch. An exception thrown by the delegate
/// goes through the same teardown-and-rethrow handling as a real open failure.
/// </remarks>
internal Func<CancellationToken, Task>? OpenAndRegisterOverrideForTests { get; set; }
/// <summary>
/// Connect the underlying gateway client + open an MXAccess session + register the
/// configured client name. Idempotent — second calls are no-ops while a connect has
/// already succeeded. Use <see cref="RecreateAsync"/> to force a rebuild after the
/// gateway restarts and leaves a faulted / stale session handle.
/// </summary>
/// <param name="clientOptions">The MX gateway client options.</param>
/// <param name="cancellationToken">The cancellation token.</param>
/// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
public async Task ConnectAsync(MxGatewayClientOptions clientOptions, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);

    // Guard on the success flag only. Guarding on a non-null session handle would turn a
    // retry into a no-op whenever a stale handle is still around.
    if (_connected) return;

    try
    {
        if (OpenAndRegisterOverrideForTests is not null)
        {
            // Test seam: the delegate stands in for the whole open + register sequence.
            await OpenAndRegisterOverrideForTests(cancellationToken).ConfigureAwait(false);
        }
        else
        {
            _ownedClient = MxGatewayClient.Create(clientOptions);
            _session = await _ownedClient.OpenSessionAsync(cancellationToken: cancellationToken).ConfigureAwait(false);
            _serverHandle = await _session.RegisterAsync(_options.ClientName, cancellationToken).ConfigureAwait(false);
        }

        // Only flip the flag once every step above has completed.
        _connected = true;
        _logger.LogInformation(
            "GalaxyMxSession connected — clientName={ClientName} serverHandle={Handle}",
            _options.ClientName, _serverHandle);
    }
    catch
    {
        // A partial open (e.g. OpenSession succeeded but Register threw) must not leave a half-open
        // handle that blocks the next reconnect. Tear the partial state down so a retry rebuilds cleanly.
        await TeardownSessionAsync().ConfigureAwait(false);
        throw;
    }
}
/// <summary>
/// Disposes the current (faulted / stale) session + owned client and rebuilds. Use this on
/// reconnect: <see cref="ConnectAsync"/> alone returns early while state from an earlier connect
/// is still in place, so the supervisor's reopen must route through here to actually re-establish.
/// </summary>
/// <param name="clientOptions">The MX gateway client options.</param>
/// <param name="cancellationToken">The cancellation token.</param>
/// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
public async Task RecreateAsync(MxGatewayClientOptions clientOptions, CancellationToken cancellationToken)
{
ObjectDisposedException.ThrowIf(_disposed, this);
// Teardown clears the connect flag and nulls the session/client, so the ConnectAsync call
// below cannot take its early-return path.
await TeardownSessionAsync().ConfigureAwait(false);
// The cancellation token is only passed to the connect step; teardown takes no token.
await ConnectAsync(clientOptions, cancellationToken).ConfigureAwait(false);
}
/// <summary>
@@ -93,19 +135,31 @@ public sealed class GalaxyMxSession : IAsyncDisposable
{
if (_disposed) return;
_disposed = true;
await TeardownSessionAsync().ConfigureAwait(false);
}
/// <summary>
/// Clears the connect flag, then disposes and drops the owned session followed by the owned
/// client. Dispose failures are logged as warnings and otherwise ignored (best-effort).
/// Called from <see cref="DisposeAsync"/>, <see cref="RecreateAsync"/> and the failure path of
/// <see cref="ConnectAsync"/>. It never touches <c>_disposed</c>, so the instance can be
/// connected again after a teardown.
/// </summary>
private async Task TeardownSessionAsync()
{
    _connected = false;

    // Session first, then the client that opened it.
    if (_session is { } staleSession)
    {
        try
        {
            await staleSession.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception disposeError)
        {
            _logger.LogWarning(disposeError, "GalaxyMxSession session dispose failed (best-effort)");
        }
    }
    _session = null;

    if (_ownedClient is { } staleClient)
    {
        try
        {
            await staleClient.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception disposeError)
        {
            _logger.LogWarning(disposeError, "GalaxyMxSession client dispose failed (best-effort)");
        }
    }
    _ownedClient = null;
}
}