Files
mxaccessgw/src/MxGateway.Server/Sessions/SessionManager.cs
T
Joseph Doherty 1d9e3afadd Resolve Server-002, -004, -005, -006 code-review findings
Server-002: the gateway never terminated leftover MxGateway.Worker.exe
processes at startup, contradicting gateway.md and CLAUDE.md. Added
IRunningProcessInspector/SystemRunningProcessInspector, OrphanWorkerTerminator,
and OrphanWorkerCleanupHostedService (best-effort, runs before sessions are
accepted); updated gateway.md to describe the implemented behavior.

Server-004: API-key scopes were persisted verbatim with no validation. Added
GatewayScopes.All/IsKnown; the CLI parser and dashboard create path now
reject unknown scope strings.

Server-005: a non-SqlException/InvalidOperationException fault on the initial
Galaxy hierarchy load faulted the BackgroundService. ExecuteAsync now catches
all non-cancellation exceptions on first load and RefreshCoreAsync broadens
its catch so the cache records Stale/Unavailable instead.

Server-006: OpenSessionAsync incremented the open-sessions gauge before
alarm auto-subscribe; an auto-subscribe failure leaked the gauge. The catch
path now calls SessionRemoved() when the gauge was incremented.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 21:31:10 -04:00

509 lines
18 KiB
C#

using System.Security.Cryptography;
using Google.Protobuf.WellKnownTypes;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using MxGateway.Contracts;
using MxGateway.Contracts.Proto;
using MxGateway.Server.Configuration;
using MxGateway.Server.Metrics;
using MxGateway.Server.Workers;
namespace MxGateway.Server.Sessions;
public sealed class SessionManager : ISessionManager
{
public const string DefaultCloseReason = "client-close";
public const string GatewayShutdownReason = "gateway-shutdown";
public const string LeaseExpiredReason = "lease-expired";
private readonly ISessionRegistry _registry;
private readonly ISessionWorkerClientFactory _workerClientFactory;
private readonly GatewayMetrics _metrics;
private readonly TimeProvider _timeProvider;
private readonly ILogger<SessionManager> _logger;
private readonly GatewayOptions _options;
private readonly SemaphoreSlim _sessionSlots;
/// <summary>
/// Initializes a new instance of <see cref="SessionManager"/>.
/// </summary>
/// <param name="registry">Session registry.</param>
/// <param name="workerClientFactory">Worker client factory.</param>
/// <param name="options">Gateway options.</param>
/// <param name="metrics">Gateway metrics.</param>
/// <param name="timeProvider">Time provider for timestamps.</param>
/// <param name="logger">Logger.</param>
public SessionManager(
ISessionRegistry registry,
ISessionWorkerClientFactory workerClientFactory,
IOptions<GatewayOptions> options,
GatewayMetrics metrics,
TimeProvider? timeProvider = null,
ILogger<SessionManager>? logger = null)
{
_registry = registry ?? throw new ArgumentNullException(nameof(registry));
_workerClientFactory = workerClientFactory ?? throw new ArgumentNullException(nameof(workerClientFactory));
ArgumentNullException.ThrowIfNull(options);
_metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
_timeProvider = timeProvider ?? TimeProvider.System;
_logger = logger ?? NullLogger<SessionManager>.Instance;
_options = options.Value;
_sessionSlots = new SemaphoreSlim(_options.Sessions.MaxSessions, _options.Sessions.MaxSessions);
}
/// <summary>
/// Opens a new gateway session and connects to the worker.
/// </summary>
/// <param name="request">Session open request.</param>
/// <param name="clientIdentity">Client authentication identity.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Opened gateway session.</returns>
public async Task<GatewaySession> OpenSessionAsync(
SessionOpenRequest request,
string? clientIdentity,
CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(request);
EnsureSessionCapacity();
GatewaySession? session = null;
bool sessionOpenedRecorded = false;
try
{
session = CreateSession(request, clientIdentity);
if (!_registry.TryAdd(session))
{
throw new SessionManagerException(
SessionManagerErrorCode.OpenFailed,
$"Session id collision while opening session {session.SessionId}.");
}
session.TransitionTo(SessionState.StartingWorker);
IWorkerClient workerClient = await _workerClientFactory
.CreateAsync(session, cancellationToken)
.ConfigureAwait(false);
session.AttachWorkerClient(workerClient);
session.MarkReady();
_metrics.SessionOpened();
sessionOpenedRecorded = true;
await TryAutoSubscribeAlarmsAsync(session, cancellationToken).ConfigureAwait(false);
return session;
}
catch (Exception exception)
{
session?.MarkFaulted(exception.Message);
if (session is not null)
{
_registry.TryRemove(session.SessionId, out _);
await session.DisposeAsync().ConfigureAwait(false);
}
// If SessionOpened() already incremented the open-session gauge,
// a failure after that point (e.g. auto-subscribe rejection) must
// decrement it again so mxgateway.sessions.open does not leak.
if (sessionOpenedRecorded)
{
_metrics.SessionRemoved();
}
ReleaseSessionSlot();
_metrics.Fault(SessionManagerErrorCode.OpenFailed.ToString());
_logger.LogWarning(
exception,
"Failed to open gateway session {SessionId}.",
session?.SessionId ?? "<not-created>");
throw new SessionManagerException(
SessionManagerErrorCode.OpenFailed,
session is null ? "Failed to create session." : $"Failed to open session {session.SessionId}.",
exception);
}
}
/// <summary>
/// Attempts to retrieve a session by ID.
/// </summary>
/// <param name="sessionId">Session identifier.</param>
/// <param name="session">The session if found.</param>
/// <returns>True if session found; otherwise false.</returns>
public bool TryGetSession(
string sessionId,
out GatewaySession session)
{
return _registry.TryGet(sessionId, out session);
}
/// <summary>
/// Invokes a worker command on a session asynchronously.
/// </summary>
/// <param name="sessionId">Session identifier.</param>
/// <param name="command">Worker command.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Command reply.</returns>
public async Task<WorkerCommandReply> InvokeAsync(
string sessionId,
WorkerCommand command,
CancellationToken cancellationToken)
{
GatewaySession session = GetRequiredSession(sessionId);
try
{
return await session.InvokeAsync(command, cancellationToken).ConfigureAwait(false);
}
catch (SessionManagerException)
{
throw;
}
catch (Exception exception)
{
if (session.WorkerClient?.State == WorkerClientState.Faulted)
{
session.MarkFaulted(exception.Message);
}
throw;
}
}
/// <summary>
/// Reads events from a session's event stream asynchronously.
/// </summary>
/// <param name="sessionId">Session identifier.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Async enumerable of worker events.</returns>
public IAsyncEnumerable<WorkerEvent> ReadEventsAsync(
string sessionId,
CancellationToken cancellationToken)
{
GatewaySession session = GetRequiredSession(sessionId);
return session.ReadEventsAsync(cancellationToken);
}
/// <summary>
/// Closes a gateway session asynchronously.
/// </summary>
/// <param name="sessionId">Session identifier.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Session close result.</returns>
public async Task<SessionCloseResult> CloseSessionAsync(
string sessionId,
CancellationToken cancellationToken)
{
GatewaySession session = GetRequiredSession(sessionId);
SessionCloseResult result = await CloseSessionCoreAsync(
session,
DefaultCloseReason,
cancellationToken).ConfigureAwait(false);
return result;
}
/// <summary>
/// Closes all sessions with expired leases asynchronously.
/// </summary>
/// <param name="now">Current time for lease expiration check.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Count of sessions closed.</returns>
public async Task<int> CloseExpiredLeasesAsync(
DateTimeOffset now,
CancellationToken cancellationToken)
{
int closedCount = 0;
foreach (GatewaySession session in _registry.Snapshot())
{
if (!session.IsLeaseExpired(now))
{
continue;
}
await CloseSessionCoreAsync(session, LeaseExpiredReason, cancellationToken).ConfigureAwait(false);
closedCount++;
}
return closedCount;
}
/// <summary>
/// Shuts down all active sessions gracefully asynchronously.
/// </summary>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Completed task.</returns>
public async Task ShutdownAsync(CancellationToken cancellationToken)
{
foreach (GatewaySession session in _registry.Snapshot())
{
try
{
await CloseSessionCoreAsync(session, GatewayShutdownReason, cancellationToken).ConfigureAwait(false);
}
catch (Exception exception)
{
_logger.LogWarning(
exception,
"Graceful shutdown failed for session {SessionId}; killing worker.",
session.SessionId);
if (_registry.TryGet(session.SessionId, out _))
{
session.KillWorker(GatewayShutdownReason);
await RemoveSessionAsync(session).ConfigureAwait(false);
}
}
}
}
private async Task<SessionCloseResult> CloseSessionCoreAsync(
GatewaySession session,
string reason,
CancellationToken cancellationToken)
{
bool wasClosed = session.State == SessionState.Closed;
try
{
SessionCloseResult result = await session.CloseAsync(reason, cancellationToken).ConfigureAwait(false);
if (!wasClosed && !result.AlreadyClosed)
{
_metrics.SessionClosed();
}
await RemoveSessionAsync(session).ConfigureAwait(false);
return result;
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
throw;
}
catch (SessionCloseStartedException exception)
{
session.MarkFaulted(exception.Message);
if (!wasClosed)
{
_metrics.SessionRemoved();
}
_metrics.Fault(SessionManagerErrorCode.CloseFailed.ToString());
await RemoveSessionAsync(session).ConfigureAwait(false);
throw new SessionManagerException(
SessionManagerErrorCode.CloseFailed,
$"Failed to close session {session.SessionId}.",
exception);
}
}
private GatewaySession GetRequiredSession(string sessionId)
{
if (!_registry.TryGet(sessionId, out GatewaySession session))
{
throw new SessionManagerException(
SessionManagerErrorCode.SessionNotFound,
$"Session {sessionId} was not found.");
}
return session;
}
private void EnsureSessionCapacity()
{
if (!_sessionSlots.Wait(0))
{
throw new SessionManagerException(
SessionManagerErrorCode.SessionLimitExceeded,
$"Gateway session limit {_options.Sessions.MaxSessions} has been reached.");
}
}
private async Task RemoveSessionAsync(GatewaySession session)
{
if (!_registry.TryRemove(session.SessionId, out GatewaySession? removedSession))
{
return;
}
_metrics.RemoveSessionEvents(session.SessionId);
ReleaseSessionSlot();
await removedSession.DisposeAsync().ConfigureAwait(false);
}
private void ReleaseSessionSlot()
{
try
{
_sessionSlots.Release();
}
catch (SemaphoreFullException)
{
}
}
private GatewaySession CreateSession(
SessionOpenRequest request,
string? clientIdentity)
{
string sessionId = CreateSessionId();
string backendName = string.IsNullOrWhiteSpace(request.RequestedBackend)
? GatewayContractInfo.DefaultBackendName
: request.RequestedBackend!;
TimeSpan commandTimeout = ResolveCommandTimeout(request.CommandTimeout);
TimeSpan startupTimeout = TimeSpan.FromSeconds(_options.Worker.StartupTimeoutSeconds);
TimeSpan shutdownTimeout = TimeSpan.FromSeconds(_options.Worker.ShutdownTimeoutSeconds);
TimeSpan leaseDuration = TimeSpan.FromSeconds(_options.Sessions.DefaultLeaseSeconds);
string pipeName = $"mxaccess-gateway-{Environment.ProcessId}-{sessionId}";
string nonce = CreateNonce();
DateTimeOffset openedAt = _timeProvider.GetUtcNow();
string clientCorrelationId = CreateClientCorrelationId(request.ClientSessionName, sessionId);
return new GatewaySession(
sessionId,
backendName,
pipeName,
nonce,
clientIdentity,
request.ClientSessionName,
clientCorrelationId,
commandTimeout,
startupTimeout,
shutdownTimeout,
leaseDuration,
openedAt);
}
private static string CreateClientCorrelationId(
string? clientSessionName,
string sessionId)
{
string clientName = string.IsNullOrWhiteSpace(clientSessionName)
? "client"
: clientSessionName!;
return $"{clientName}-{sessionId}";
}
private TimeSpan ResolveCommandTimeout(Duration? requestedTimeout)
{
if (requestedTimeout is null)
{
return TimeSpan.FromSeconds(_options.Sessions.DefaultCommandTimeoutSeconds);
}
TimeSpan timeout = requestedTimeout.ToTimeSpan();
return timeout <= TimeSpan.Zero
? TimeSpan.FromSeconds(_options.Sessions.DefaultCommandTimeoutSeconds)
: timeout;
}
private static string CreateSessionId()
{
return $"session-{Guid.NewGuid():N}";
}
private static string CreateNonce()
{
Span<byte> bytes = stackalloc byte[32];
RandomNumberGenerator.Fill(bytes);
return Convert.ToBase64String(bytes);
}
/// <summary>
/// If <c>Alarms.Enabled</c> is configured, issue a
/// <c>SubscribeAlarmsCommand</c> on the freshly-Ready session so the
/// worker's wnwrap consumer starts polling. Failure handling is
/// governed by <c>Alarms.RequireSubscribeOnOpen</c>:
/// <list type="bullet">
/// <item><description><c>true</c> — propagate the failure to fault the session.</description></item>
/// <item><description><c>false</c> (default) — log a warning and let the session continue serving data subscriptions.</description></item>
/// </list>
/// </summary>
private async Task TryAutoSubscribeAlarmsAsync(
GatewaySession session,
CancellationToken cancellationToken)
{
AlarmsOptions alarms = _options.Alarms;
if (!alarms.Enabled) return;
string subscription = ResolveAlarmSubscription(alarms);
if (string.IsNullOrWhiteSpace(subscription))
{
const string diagnostic =
"Alarms.Enabled is true but no SubscriptionExpression / DefaultArea is configured.";
if (alarms.RequireSubscribeOnOpen)
{
throw new SessionManagerException(
SessionManagerErrorCode.OpenFailed, diagnostic);
}
_logger.LogWarning(
"Auto-subscribe skipped for session {SessionId}: {Diagnostic}",
session.SessionId, diagnostic);
return;
}
WorkerCommand command = new WorkerCommand
{
Command = new MxCommand
{
Kind = MxCommandKind.SubscribeAlarms,
SubscribeAlarms = new SubscribeAlarmsCommand
{
SubscriptionExpression = subscription,
},
},
EnqueueTimestamp = Timestamp.FromDateTimeOffset(_timeProvider.GetUtcNow()),
};
try
{
WorkerCommandReply reply = await session.InvokeAsync(command, cancellationToken)
.ConfigureAwait(false);
ProtocolStatusCode? code = reply.Reply?.ProtocolStatus?.Code;
if (code != ProtocolStatusCode.Ok)
{
string diagnostic = reply.Reply?.DiagnosticMessage
?? reply.Reply?.ProtocolStatus?.Message
?? "Worker rejected SubscribeAlarms.";
if (alarms.RequireSubscribeOnOpen)
{
throw new SessionManagerException(
SessionManagerErrorCode.OpenFailed,
$"Auto-subscribe failed for session {session.SessionId}: {diagnostic}");
}
_logger.LogWarning(
"Auto-subscribe failed for session {SessionId} (status {StatusCode}): {Diagnostic}",
session.SessionId, code, diagnostic);
return;
}
_logger.LogInformation(
"Alarm auto-subscribe succeeded for session {SessionId} on {Subscription}.",
session.SessionId, subscription);
}
catch (SessionManagerException)
{
throw;
}
catch (Exception ex) when (!alarms.RequireSubscribeOnOpen)
{
_logger.LogWarning(
ex,
"Auto-subscribe threw for session {SessionId} on {Subscription}; alarm path remains inactive.",
session.SessionId, subscription);
}
}
private static string ResolveAlarmSubscription(AlarmsOptions alarms)
{
if (!string.IsNullOrWhiteSpace(alarms.SubscriptionExpression))
{
return alarms.SubscriptionExpression;
}
if (!string.IsNullOrWhiteSpace(alarms.DefaultArea))
{
return $@"\\{Environment.MachineName}\Galaxy!{alarms.DefaultArea}";
}
return string.Empty;
}
}