fix(galaxy): reconnect recreates a faulted session instead of no-op'ing
This commit is contained in:
@@ -282,15 +282,17 @@ public sealed class GalaxyDriver
|
||||
}
|
||||
|
||||
/// <summary>
/// Reopen callback for <see cref="ReconnectSupervisor"/>: recreate the gw session. After a
/// gateway restart the existing session handle is faulted / stale but non-null, so a plain
/// <c>ConnectAsync</c> would no-op and the supervisor would loop forever replaying against a
/// dead session. <see cref="GalaxyMxSession.RecreateAsync"/> disposes the stale session +
/// owned client and rebuilds, so a never-connected session still opens fresh.
/// </summary>
/// <param name="cancellationToken">Token forwarded to the recreate call.</param>
private async Task ReopenAsync(CancellationToken cancellationToken)
{
    // Nothing to reopen when this driver holds no session in _ownedMxSession.
    if (_ownedMxSession is null) return;

    var clientOptions = BuildClientOptions(_options.Gateway);

    // Recreate only. Calling ConnectAsync first would either no-op or throw before the
    // rebuild ever runs, which is exactly the reconnect loop this callback must avoid.
    await _ownedMxSession.RecreateAsync(clientOptions, cancellationToken).ConfigureAwait(false);
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -28,6 +28,7 @@ public sealed class GalaxyMxSession : IAsyncDisposable
|
||||
private MxGatewayClient? _ownedClient;
|
||||
private MxGatewaySession? _session;
|
||||
private int _serverHandle;
|
||||
private bool _connected;
|
||||
private bool _disposed;
|
||||
|
||||
/// <summary>Initializes a new instance of the GalaxyMxSession class.</summary>
|
||||
@@ -48,24 +49,65 @@ public sealed class GalaxyMxSession : IAsyncDisposable
|
||||
/// </summary>
|
||||
public int ServerHandle => _serverHandle;
|
||||
|
||||
/// <summary>
|
||||
/// Test seam — when set, replaces the real "open session + register" work so unit tests can
|
||||
/// exercise the connect / recreate orchestration without a live gateway (the SDK session/client
|
||||
/// types are sealed + internal-ctor and cannot be faked). Null in production = real gateway path.
|
||||
/// </summary>
|
||||
internal Func<CancellationToken, Task>? OpenAndRegisterOverrideForTests { get; set; }
|
||||
|
||||
/// <summary>
/// Connect the underlying gateway client + open an MXAccess session + register the
/// configured client name. Idempotent — second calls are no-ops while a connect has
/// already succeeded. Use <see cref="RecreateAsync"/> to force a rebuild after the
/// gateway restarts and leaves a faulted / stale session handle.
/// </summary>
/// <param name="clientOptions">The MX gateway client options.</param>
/// <param name="cancellationToken">The cancellation token.</param>
/// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
public async Task ConnectAsync(MxGatewayClientOptions clientOptions, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);

    // Gate on the success flag rather than on a non-null session handle: a handle can be
    // present but dead, and only a completed open + register sets _connected.
    if (_connected) return;

    try
    {
        if (OpenAndRegisterOverrideForTests is not null)
        {
            // Test seam: stands in for the real open + register work.
            await OpenAndRegisterOverrideForTests(cancellationToken).ConfigureAwait(false);
        }
        else
        {
            _ownedClient = MxGatewayClient.Create(clientOptions);
            _session = await _ownedClient.OpenSessionAsync(cancellationToken: cancellationToken).ConfigureAwait(false);
            _serverHandle = await _session.RegisterAsync(_options.ClientName, cancellationToken).ConfigureAwait(false);
        }

        _connected = true;
        _logger.LogInformation(
            "GalaxyMxSession connected — clientName={ClientName} serverHandle={Handle}",
            _options.ClientName, _serverHandle);
    }
    catch
    {
        // A partial open (e.g. OpenSession succeeded but Register threw) must not leave a half-open
        // handle that blocks the next reconnect. Tear the partial state down so a retry rebuilds cleanly.
        await TeardownSessionAsync().ConfigureAwait(false);
        throw;
    }
}
|
||||
|
||||
/// <summary>
/// Forces a rebuild of the gateway connection: tears down whatever session + owned client
/// are currently held (they may be faulted / stale) and then runs a normal
/// <see cref="ConnectAsync"/>. This is the reconnect entry point — <see cref="ConnectAsync"/>
/// by itself returns early once a connect has succeeded, so it cannot replace a dead session.
/// </summary>
/// <param name="clientOptions">The MX gateway client options.</param>
/// <param name="cancellationToken">The cancellation token.</param>
public async Task RecreateAsync(MxGatewayClientOptions clientOptions, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);

    // Step 1: drop the existing state; this also clears the connected flag.
    await TeardownSessionAsync().ConfigureAwait(false);

    // Step 2: with the flag cleared, ConnectAsync performs a full open + register.
    await ConnectAsync(clientOptions, cancellationToken).ConfigureAwait(false);
}
|
||||
|
||||
/// <summary>
|
||||
@@ -93,19 +135,31 @@ public sealed class GalaxyMxSession : IAsyncDisposable
|
||||
{
|
||||
if (_disposed) return;
|
||||
_disposed = true;
|
||||
await TeardownSessionAsync().ConfigureAwait(false);
|
||||
}
|
||||
|
||||
/// <summary>
/// Resets the connect flag and best-effort disposes + nulls the owned session and client.
/// Shared by <see cref="DisposeAsync"/> (final teardown) and <see cref="RecreateAsync"/> /
/// the partial-open recovery path (rebuild teardown) — it does NOT set <c>_disposed</c>, so
/// a torn-down session can be re-opened.
/// </summary>
/// <remarks>
/// Dispose failures are logged as warnings and swallowed, so one failing dispose does not
/// prevent the other resource from being released or the fields from being cleared.
/// </remarks>
private async Task TeardownSessionAsync()
{
    // Clear the flag first so ConnectAsync will do a full open on its next call.
    _connected = false;

    // NOTE(review): _serverHandle is not reset here, so ServerHandle keeps reporting the last
    // registered handle until the next successful connect — confirm that is intended.

    // Session before client: the session was opened from the client.
    if (_session is not null)
    {
        try { await _session.DisposeAsync().ConfigureAwait(false); }
        catch (Exception ex) { _logger.LogWarning(ex, "GalaxyMxSession session dispose failed (best-effort)"); }
        _session = null;
    }

    if (_ownedClient is not null)
    {
        try { await _ownedClient.DisposeAsync().ConfigureAwait(false); }
        catch (Exception ex) { _logger.LogWarning(ex, "GalaxyMxSession client dispose failed (best-effort)"); }
        _ownedClient = null;
    }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user