Improve gateway reliability and dashboard docs

This commit is contained in:
Joseph Doherty
2026-04-28 00:13:22 -04:00
parent bd4a09a35e
commit 4fc355b357
61 changed files with 1722 additions and 150 deletions
+60 -15
View File
@@ -23,6 +23,7 @@ public sealed class SessionManager : ISessionManager
private readonly TimeProvider _timeProvider;
private readonly ILogger<SessionManager> _logger;
private readonly GatewayOptions _options;
private readonly SemaphoreSlim _sessionSlots;
public SessionManager(
ISessionRegistry registry,
@@ -39,6 +40,7 @@ public sealed class SessionManager : ISessionManager
_timeProvider = timeProvider ?? TimeProvider.System;
_logger = logger ?? NullLogger<SessionManager>.Instance;
_options = options.Value;
_sessionSlots = new SemaphoreSlim(_options.Sessions.MaxSessions, _options.Sessions.MaxSessions);
}
public async Task<GatewaySession> OpenSessionAsync(
@@ -49,16 +51,17 @@ public sealed class SessionManager : ISessionManager
ArgumentNullException.ThrowIfNull(request);
EnsureSessionCapacity();
GatewaySession session = CreateSession(request, clientIdentity);
if (!_registry.TryAdd(session))
{
throw new SessionManagerException(
SessionManagerErrorCode.OpenFailed,
$"Session id collision while opening session {session.SessionId}.");
}
GatewaySession? session = null;
try
{
session = CreateSession(request, clientIdentity);
if (!_registry.TryAdd(session))
{
throw new SessionManagerException(
SessionManagerErrorCode.OpenFailed,
$"Session id collision while opening session {session.SessionId}.");
}
session.TransitionTo(SessionState.StartingWorker);
IWorkerClient workerClient = await _workerClientFactory
.CreateAsync(session, cancellationToken)
@@ -72,18 +75,23 @@ public sealed class SessionManager : ISessionManager
}
catch (Exception exception)
{
session.MarkFaulted(exception.Message);
_registry.TryRemove(session.SessionId, out _);
await session.DisposeAsync().ConfigureAwait(false);
session?.MarkFaulted(exception.Message);
if (session is not null)
{
_registry.TryRemove(session.SessionId, out _);
await session.DisposeAsync().ConfigureAwait(false);
}
ReleaseSessionSlot();
_metrics.Fault(SessionManagerErrorCode.OpenFailed.ToString());
_logger.LogWarning(
exception,
"Failed to open gateway session {SessionId}.",
session.SessionId);
session?.SessionId ?? "<not-created>");
throw new SessionManagerException(
SessionManagerErrorCode.OpenFailed,
$"Failed to open session {session.SessionId}.",
session is null ? "Failed to create session." : $"Failed to open session {session.SessionId}.",
exception);
}
}
@@ -177,6 +185,7 @@ public sealed class SessionManager : ISessionManager
"Graceful shutdown failed for session {SessionId}; killing worker.",
session.SessionId);
session.KillWorker(GatewayShutdownReason);
await RemoveSessionAsync(session).ConfigureAwait(false);
}
}
}
@@ -195,6 +204,7 @@ public sealed class SessionManager : ISessionManager
_metrics.SessionClosed();
}
await RemoveSessionAsync(session).ConfigureAwait(false);
return result;
}
catch (Exception exception)
@@ -222,7 +232,7 @@ public sealed class SessionManager : ISessionManager
private void EnsureSessionCapacity()
{
if (_registry.ActiveCount >= _options.Sessions.MaxSessions)
if (!_sessionSlots.Wait(0))
{
throw new SessionManagerException(
SessionManagerErrorCode.SessionLimitExceeded,
@@ -230,6 +240,29 @@ public sealed class SessionManager : ISessionManager
}
}
private async Task RemoveSessionAsync(GatewaySession session)
{
if (!_registry.TryRemove(session.SessionId, out GatewaySession? removedSession))
{
return;
}
_metrics.RemoveSessionEvents(session.SessionId);
ReleaseSessionSlot();
await removedSession.DisposeAsync().ConfigureAwait(false);
}
private void ReleaseSessionSlot()
{
try
{
_sessionSlots.Release();
}
catch (SemaphoreFullException)
{
}
}
private GatewaySession CreateSession(
SessionOpenRequest request,
string? clientIdentity)
@@ -244,6 +277,7 @@ public sealed class SessionManager : ISessionManager
string pipeName = $"mxaccess-gateway-{Environment.ProcessId}-{sessionId}";
string nonce = CreateNonce();
DateTimeOffset openedAt = _timeProvider.GetUtcNow();
string clientCorrelationId = CreateClientCorrelationId(request.ClientSessionName, sessionId);
return new GatewaySession(
sessionId,
@@ -252,13 +286,24 @@ public sealed class SessionManager : ISessionManager
nonce,
clientIdentity,
request.ClientSessionName,
request.ClientCorrelationId,
clientCorrelationId,
commandTimeout,
startupTimeout,
shutdownTimeout,
openedAt);
}
private static string CreateClientCorrelationId(
string? clientSessionName,
string sessionId)
{
string clientName = string.IsNullOrWhiteSpace(clientSessionName)
? "client"
: clientSessionName!;
return $"{clientName}-{sessionId}";
}
private TimeSpan ResolveCommandTimeout(Duration? requestedTimeout)
{
if (requestedTimeout is null)
@@ -7,6 +7,7 @@ public static class SessionServiceCollectionExtensions
services.AddSingleton<ISessionRegistry, SessionRegistry>();
services.AddSingleton<ISessionWorkerClientFactory, SessionWorkerClientFactory>();
services.AddSingleton<ISessionManager, SessionManager>();
services.AddHostedService<SessionShutdownHostedService>();
return services;
}
@@ -0,0 +1,26 @@
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
namespace MxGateway.Server.Sessions;
public sealed class SessionShutdownHostedService(
ISessionManager sessionManager,
ILogger<SessionShutdownHostedService> logger) : IHostedService
{
public Task StartAsync(CancellationToken cancellationToken)
{
return Task.CompletedTask;
}
public async Task StopAsync(CancellationToken cancellationToken)
{
try
{
await sessionManager.ShutdownAsync(cancellationToken).ConfigureAwait(false);
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
logger.LogWarning("Gateway session shutdown was canceled by host shutdown timeout.");
}
}
}
@@ -74,6 +74,7 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
HeartbeatGrace = TimeSpan.FromSeconds(_options.Worker.HeartbeatGraceSeconds),
HeartbeatCheckInterval = TimeSpan.FromSeconds(_options.Worker.HeartbeatIntervalSeconds),
EventChannelCapacity = _options.Events.QueueCapacity,
MaxPendingCommands = _options.Sessions.MaxPendingCommandsPerSession,
};
workerClient = new WorkerClient(