1d9e3afadd
Server-002: the gateway never terminated leftover MxGateway.Worker.exe processes at startup, contradicting gateway.md and CLAUDE.md. Added IRunningProcessInspector/SystemRunningProcessInspector, OrphanWorkerTerminator, and OrphanWorkerCleanupHostedService (best-effort, runs before sessions are accepted); updated gateway.md to describe the implemented behavior. Server-004: API-key scopes were persisted verbatim with no validation. Added GatewayScopes.All/IsKnown; the CLI parser and dashboard create path now reject unknown scope strings. Server-005: a non-SqlException/InvalidOperationException fault on the initial Galaxy hierarchy load faulted the BackgroundService. ExecuteAsync now catches all non-cancellation exceptions on first load and RefreshCoreAsync broadens its catch so the cache records Stale/Unavailable instead. Server-006: OpenSessionAsync incremented the open-sessions gauge before alarm auto-subscribe; an auto-subscribe failure leaked the gauge. The catch path now calls SessionRemoved() when the gauge was incremented. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
509 lines
18 KiB
C#
509 lines
18 KiB
C#
using System.Security.Cryptography;
|
|
using Google.Protobuf.WellKnownTypes;
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.Extensions.Logging.Abstractions;
|
|
using Microsoft.Extensions.Options;
|
|
using MxGateway.Contracts;
|
|
using MxGateway.Contracts.Proto;
|
|
using MxGateway.Server.Configuration;
|
|
using MxGateway.Server.Metrics;
|
|
using MxGateway.Server.Workers;
|
|
|
|
namespace MxGateway.Server.Sessions;
|
|
|
|
public sealed class SessionManager : ISessionManager
|
|
{
|
|
public const string DefaultCloseReason = "client-close";
|
|
public const string GatewayShutdownReason = "gateway-shutdown";
|
|
public const string LeaseExpiredReason = "lease-expired";
|
|
|
|
private readonly ISessionRegistry _registry;
|
|
private readonly ISessionWorkerClientFactory _workerClientFactory;
|
|
private readonly GatewayMetrics _metrics;
|
|
private readonly TimeProvider _timeProvider;
|
|
private readonly ILogger<SessionManager> _logger;
|
|
private readonly GatewayOptions _options;
|
|
private readonly SemaphoreSlim _sessionSlots;
|
|
|
|
/// <summary>
/// Creates a session manager and sizes its session-slot semaphore from
/// <c>Sessions.MaxSessions</c> in the supplied options.
/// </summary>
/// <param name="registry">Registry that tracks sessions by id.</param>
/// <param name="workerClientFactory">Factory that creates the worker client for each new session.</param>
/// <param name="options">Gateway options; the wrapped value is captured once here.</param>
/// <param name="metrics">Gateway metrics sink.</param>
/// <param name="timeProvider">Clock used for timestamps; <see cref="TimeProvider.System"/> when null.</param>
/// <param name="logger">Logger; a no-op logger is used when null.</param>
/// <exception cref="ArgumentNullException">A required dependency is null.</exception>
public SessionManager(
    ISessionRegistry registry,
    ISessionWorkerClientFactory workerClientFactory,
    IOptions<GatewayOptions> options,
    GatewayMetrics metrics,
    TimeProvider? timeProvider = null,
    ILogger<SessionManager>? logger = null)
{
    // Validate required dependencies up front, in declaration order.
    ArgumentNullException.ThrowIfNull(registry);
    ArgumentNullException.ThrowIfNull(workerClientFactory);
    ArgumentNullException.ThrowIfNull(options);
    ArgumentNullException.ThrowIfNull(metrics);

    _registry = registry;
    _workerClientFactory = workerClientFactory;
    _metrics = metrics;
    _timeProvider = timeProvider ?? TimeProvider.System;
    _logger = logger ?? NullLogger<SessionManager>.Instance;
    _options = options.Value;

    // The semaphore starts full: every slot is available until a session is opened.
    int maxSessions = _options.Sessions.MaxSessions;
    _sessionSlots = new SemaphoreSlim(maxSessions, maxSessions);
}
|
|
|
|
/// <summary>
/// Opens a new gateway session: reserves a session slot, registers the
/// session, creates and attaches its worker client, marks it ready, and
/// then runs the optional alarm auto-subscribe.
/// </summary>
/// <param name="request">Session open request.</param>
/// <param name="clientIdentity">Client authentication identity, if any.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The opened gateway session.</returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
/// <exception cref="SessionManagerException">
/// The session limit has been reached, or any step of the open sequence
/// failed (reported as <see cref="SessionManagerErrorCode.OpenFailed"/>
/// with the original exception as the inner exception).
/// </exception>
public async Task<GatewaySession> OpenSessionAsync(
    SessionOpenRequest request,
    string? clientIdentity,
    CancellationToken cancellationToken)
{
    ArgumentNullException.ThrowIfNull(request);
    EnsureSessionCapacity();

    GatewaySession? session = null;
    bool registered = false;
    bool sessionOpenedRecorded = false;
    try
    {
        session = CreateSession(request, clientIdentity);
        if (!_registry.TryAdd(session))
        {
            throw new SessionManagerException(
                SessionManagerErrorCode.OpenFailed,
                $"Session id collision while opening session {session.SessionId}.");
        }

        registered = true;

        session.TransitionTo(SessionState.StartingWorker);
        IWorkerClient workerClient = await _workerClientFactory
            .CreateAsync(session, cancellationToken)
            .ConfigureAwait(false);

        session.AttachWorkerClient(workerClient);
        session.MarkReady();
        _metrics.SessionOpened();
        sessionOpenedRecorded = true;

        await TryAutoSubscribeAlarmsAsync(session, cancellationToken).ConfigureAwait(false);

        return session;
    }
    catch (Exception exception)
    {
        if (session is not null)
        {
            try
            {
                session.MarkFaulted(exception.Message);

                // Only remove the registry entry this call created. After an
                // id collision the entry under this id belongs to a different,
                // live session and must not be evicted.
                if (registered)
                {
                    _registry.TryRemove(session.SessionId, out _);
                }

                await session.DisposeAsync().ConfigureAwait(false);
            }
            catch (Exception cleanupException)
            {
                // A cleanup failure must not skip the gauge rollback and slot
                // release below, nor replace the original open failure.
                _logger.LogWarning(
                    cleanupException,
                    "Cleanup failed after open failure for gateway session {SessionId}.",
                    session.SessionId);
            }
        }

        // If SessionOpened() already incremented the open-session gauge,
        // a failure after that point (e.g. auto-subscribe rejection) must
        // decrement it again so mxgateway.sessions.open does not leak.
        if (sessionOpenedRecorded)
        {
            _metrics.SessionRemoved();
        }

        ReleaseSessionSlot();
        _metrics.Fault(SessionManagerErrorCode.OpenFailed.ToString());
        _logger.LogWarning(
            exception,
            "Failed to open gateway session {SessionId}.",
            session?.SessionId ?? "<not-created>");

        throw new SessionManagerException(
            SessionManagerErrorCode.OpenFailed,
            session is null ? "Failed to create session." : $"Failed to open session {session.SessionId}.",
            exception);
    }
}
|
|
|
|
/// <summary>
/// Looks up a session in the registry by its identifier.
/// </summary>
/// <param name="sessionId">Session identifier.</param>
/// <param name="session">Receives the session when the lookup succeeds.</param>
/// <returns><c>true</c> when the registry holds the session; otherwise <c>false</c>.</returns>
public bool TryGetSession(string sessionId, out GatewaySession session)
    => _registry.TryGet(sessionId, out session);
|
|
|
|
/// <summary>
/// Sends a worker command to the given session and returns the reply.
/// </summary>
/// <param name="sessionId">Session identifier.</param>
/// <param name="command">Worker command to invoke.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The worker's reply to the command.</returns>
/// <exception cref="SessionManagerException">The session is not registered.</exception>
public async Task<WorkerCommandReply> InvokeAsync(
    string sessionId,
    WorkerCommand command,
    CancellationToken cancellationToken)
{
    GatewaySession target = GetRequiredSession(sessionId);

    try
    {
        return await target.InvokeAsync(command, cancellationToken).ConfigureAwait(false);
    }
    catch (Exception failure) when (failure is not SessionManagerException)
    {
        // SessionManagerException passes through untouched. For any other
        // failure, fault the session only if its worker client reports Faulted.
        bool workerFaulted = target.WorkerClient?.State == WorkerClientState.Faulted;
        if (workerFaulted)
        {
            target.MarkFaulted(failure.Message);
        }

        throw;
    }
}
|
|
|
|
/// <summary>
/// Returns the event stream of the given session.
/// </summary>
/// <param name="sessionId">Session identifier.</param>
/// <param name="cancellationToken">Cancellation token passed to the session's event reader.</param>
/// <returns>Async enumerable of worker events.</returns>
/// <exception cref="SessionManagerException">The session is not registered.</exception>
public IAsyncEnumerable<WorkerEvent> ReadEventsAsync(
    string sessionId,
    CancellationToken cancellationToken)
    => GetRequiredSession(sessionId).ReadEventsAsync(cancellationToken);
|
|
|
|
/// <summary>
/// Closes the given session on behalf of a client, using
/// <see cref="DefaultCloseReason"/> as the close reason.
/// </summary>
/// <param name="sessionId">Session identifier.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Session close result.</returns>
/// <exception cref="SessionManagerException">The session is not registered, or closing it failed.</exception>
public async Task<SessionCloseResult> CloseSessionAsync(
    string sessionId,
    CancellationToken cancellationToken)
{
    GatewaySession target = GetRequiredSession(sessionId);

    return await CloseSessionCoreAsync(target, DefaultCloseReason, cancellationToken)
        .ConfigureAwait(false);
}
|
|
|
|
/// <summary>
/// Walks a snapshot of the registry and closes every session whose lease
/// has expired at <paramref name="now"/>, using
/// <see cref="LeaseExpiredReason"/> as the close reason.
/// </summary>
/// <param name="now">Reference time for the lease-expiry check.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Number of sessions closed by this sweep.</returns>
public async Task<int> CloseExpiredLeasesAsync(
    DateTimeOffset now,
    CancellationToken cancellationToken)
{
    int expiredClosed = 0;

    foreach (GatewaySession candidate in _registry.Snapshot())
    {
        if (candidate.IsLeaseExpired(now))
        {
            // A close failure propagates to the caller and ends the sweep.
            await CloseSessionCoreAsync(candidate, LeaseExpiredReason, cancellationToken)
                .ConfigureAwait(false);
            expiredClosed++;
        }
    }

    return expiredClosed;
}
|
|
|
|
/// <summary>
/// Closes every session in a snapshot of the registry with
/// <see cref="GatewayShutdownReason"/>. When a graceful close throws, the
/// failure is logged and, if the session is still registered, its worker
/// is killed and the session is removed.
/// </summary>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>A task that completes when all sessions have been processed.</returns>
public async Task ShutdownAsync(CancellationToken cancellationToken)
{
    foreach (GatewaySession activeSession in _registry.Snapshot())
    {
        try
        {
            await CloseSessionCoreAsync(activeSession, GatewayShutdownReason, cancellationToken)
                .ConfigureAwait(false);
        }
        catch (Exception closeFailure)
        {
            _logger.LogWarning(
                closeFailure,
                "Graceful shutdown failed for session {SessionId}; killing worker.",
                activeSession.SessionId);

            // The close path may already have removed the session; only
            // force-kill when it is still registered.
            bool stillRegistered = _registry.TryGet(activeSession.SessionId, out _);
            if (!stillRegistered)
            {
                continue;
            }

            activeSession.KillWorker(GatewayShutdownReason);
            await RemoveSessionAsync(activeSession).ConfigureAwait(false);
        }
    }
}
|
|
|
|
/// <summary>
/// Shared close path: closes the session with the given reason, records
/// the close metric when this call performed the close, and removes the
/// session from the registry.
/// </summary>
/// <param name="session">Session to close.</param>
/// <param name="reason">Close reason passed to the session.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The result reported by the session's close operation.</returns>
/// <exception cref="SessionManagerException">
/// The close raised <see cref="SessionCloseStartedException"/>; the session
/// is faulted and removed before this is thrown with code
/// <see cref="SessionManagerErrorCode.CloseFailed"/>.
/// </exception>
private async Task<SessionCloseResult> CloseSessionCoreAsync(
    GatewaySession session,
    string reason,
    CancellationToken cancellationToken)
{
    // Captured before the close call so metrics are not recorded twice for
    // a session that was already closed.
    bool closedBeforeCall = session.State == SessionState.Closed;

    try
    {
        SessionCloseResult closeResult = await session
            .CloseAsync(reason, cancellationToken)
            .ConfigureAwait(false);

        bool closedByThisCall = !(closedBeforeCall || closeResult.AlreadyClosed);
        if (closedByThisCall)
        {
            _metrics.SessionClosed();
        }

        await RemoveSessionAsync(session).ConfigureAwait(false);
        return closeResult;
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // Caller-requested cancellation propagates unchanged; the session
        // is left registered.
        throw;
    }
    catch (SessionCloseStartedException closeFailure)
    {
        session.MarkFaulted(closeFailure.Message);

        if (!closedBeforeCall)
        {
            _metrics.SessionRemoved();
        }

        _metrics.Fault(SessionManagerErrorCode.CloseFailed.ToString());
        await RemoveSessionAsync(session).ConfigureAwait(false);

        throw new SessionManagerException(
            SessionManagerErrorCode.CloseFailed,
            $"Failed to close session {session.SessionId}.",
            closeFailure);
    }
}
|
|
|
|
/// <summary>
/// Returns the registered session with the given id, or throws when the
/// registry does not hold it.
/// </summary>
/// <param name="sessionId">Session identifier.</param>
/// <returns>The registered session.</returns>
/// <exception cref="SessionManagerException">
/// No session with that id is registered
/// (<see cref="SessionManagerErrorCode.SessionNotFound"/>).
/// </exception>
private GatewaySession GetRequiredSession(string sessionId)
{
    if (_registry.TryGet(sessionId, out GatewaySession session))
    {
        return session;
    }

    throw new SessionManagerException(
        SessionManagerErrorCode.SessionNotFound,
        $"Session {sessionId} was not found.");
}
|
|
|
|
/// <summary>
/// Tries to take one session slot without waiting and throws when no slot
/// is available.
/// </summary>
/// <exception cref="SessionManagerException">
/// No slot is free (<see cref="SessionManagerErrorCode.SessionLimitExceeded"/>).
/// </exception>
private void EnsureSessionCapacity()
{
    // Zero timeout: either a slot is taken immediately or the open is rejected.
    bool slotAcquired = _sessionSlots.Wait(0);
    if (slotAcquired)
    {
        return;
    }

    throw new SessionManagerException(
        SessionManagerErrorCode.SessionLimitExceeded,
        $"Gateway session limit {_options.Sessions.MaxSessions} has been reached.");
}
|
|
|
|
/// <summary>
/// Removes the session from the registry. When this call is the one that
/// removed it, the session's event metrics are cleared, its slot is
/// released, and the removed session is disposed. Does nothing when the
/// registry no longer holds the session.
/// </summary>
/// <param name="session">Session to remove.</param>
private async Task RemoveSessionAsync(GatewaySession session)
{
    if (_registry.TryRemove(session.SessionId, out GatewaySession? removedSession))
    {
        _metrics.RemoveSessionEvents(session.SessionId);
        ReleaseSessionSlot();
        await removedSession.DisposeAsync().ConfigureAwait(false);
    }
}
|
|
|
|
/// <summary>
/// Returns one session slot to the semaphore. Best-effort: releasing when
/// the semaphore is already full is tolerated, but logged because it means
/// a slot was released more often than it was acquired.
/// </summary>
private void ReleaseSessionSlot()
{
    try
    {
        _sessionSlots.Release();
    }
    catch (SemaphoreFullException exception)
    {
        // Never fail the caller's cleanup path over slot accounting, but
        // make the double release visible instead of swallowing it silently.
        _logger.LogWarning(
            exception,
            "Session slot released while all {MaxSessions} slots were already free; ignoring the extra release.",
            _options.Sessions.MaxSessions);
    }
}
|
|
|
|
/// <summary>
/// Builds a new, unregistered <see cref="GatewaySession"/> from the open
/// request and the configured gateway options: a fresh session id, the
/// requested or default backend, a per-process pipe name, a random nonce,
/// and the resolved timeouts and lease duration.
/// </summary>
/// <param name="request">Session open request.</param>
/// <param name="clientIdentity">Client authentication identity, if any.</param>
/// <returns>The newly constructed session.</returns>
private GatewaySession CreateSession(
    SessionOpenRequest request,
    string? clientIdentity)
{
    string sessionId = CreateSessionId();

    // Fall back to the contract's default backend when none was requested.
    string backendName;
    if (string.IsNullOrWhiteSpace(request.RequestedBackend))
    {
        backendName = GatewayContractInfo.DefaultBackendName;
    }
    else
    {
        backendName = request.RequestedBackend!;
    }

    TimeSpan commandTimeout = ResolveCommandTimeout(request.CommandTimeout);
    TimeSpan workerStartupTimeout = TimeSpan.FromSeconds(_options.Worker.StartupTimeoutSeconds);
    TimeSpan workerShutdownTimeout = TimeSpan.FromSeconds(_options.Worker.ShutdownTimeoutSeconds);
    TimeSpan lease = TimeSpan.FromSeconds(_options.Sessions.DefaultLeaseSeconds);

    // The pipe name embeds this process id and the session id.
    string pipeName = $"mxaccess-gateway-{Environment.ProcessId}-{sessionId}";
    string nonce = CreateNonce();
    DateTimeOffset openedAt = _timeProvider.GetUtcNow();
    string correlationId = CreateClientCorrelationId(request.ClientSessionName, sessionId);

    return new GatewaySession(
        sessionId,
        backendName,
        pipeName,
        nonce,
        clientIdentity,
        request.ClientSessionName,
        correlationId,
        commandTimeout,
        workerStartupTimeout,
        workerShutdownTimeout,
        lease,
        openedAt);
}
|
|
|
|
/// <summary>
/// Builds the client correlation id as <c>{name}-{sessionId}</c>, where
/// the name is the client session name or the literal <c>client</c> when
/// that name is null, empty, or whitespace.
/// </summary>
/// <param name="clientSessionName">Optional client-supplied session name.</param>
/// <param name="sessionId">Gateway session id.</param>
/// <returns>The correlation id string.</returns>
private static string CreateClientCorrelationId(
    string? clientSessionName,
    string sessionId)
{
    string prefix = "client";
    if (!string.IsNullOrWhiteSpace(clientSessionName))
    {
        prefix = clientSessionName;
    }

    return prefix + "-" + sessionId;
}
|
|
|
|
/// <summary>
/// Chooses the command timeout for a session: the requested duration when
/// it is present and strictly positive, otherwise the configured
/// <c>Sessions.DefaultCommandTimeoutSeconds</c>.
/// </summary>
/// <param name="requestedTimeout">Timeout requested by the client, if any.</param>
/// <returns>The effective command timeout.</returns>
private TimeSpan ResolveCommandTimeout(Duration? requestedTimeout)
{
    if (requestedTimeout is not null)
    {
        TimeSpan requested = requestedTimeout.ToTimeSpan();
        if (requested > TimeSpan.Zero)
        {
            return requested;
        }
    }

    // Missing, zero, or negative request: use the configured default.
    return TimeSpan.FromSeconds(_options.Sessions.DefaultCommandTimeoutSeconds);
}
|
|
|
|
/// <summary>
/// Generates a session id of the form <c>session-</c> followed by a new
/// GUID rendered as 32 hexadecimal digits without separators.
/// </summary>
/// <returns>The new session id.</returns>
private static string CreateSessionId()
{
    string uniquePart = Guid.NewGuid().ToString("N");
    return "session-" + uniquePart;
}
|
|
|
|
/// <summary>
/// Generates a nonce: 32 bytes from the cryptographic random number
/// generator, encoded as Base64.
/// </summary>
/// <returns>The Base64-encoded nonce.</returns>
private static string CreateNonce()
{
    const int NonceSizeInBytes = 32;

    byte[] randomBytes = RandomNumberGenerator.GetBytes(NonceSizeInBytes);
    return Convert.ToBase64String(randomBytes);
}
|
|
|
|
/// <summary>
/// If <c>Alarms.Enabled</c> is configured, issue a
/// <c>SubscribeAlarmsCommand</c> on the freshly-Ready session so the
/// worker's wnwrap consumer starts polling. Failure handling is
/// governed by <c>Alarms.RequireSubscribeOnOpen</c>:
/// <list type="bullet">
/// <item><description><c>true</c> — propagate the failure to fault the session.</description></item>
/// <item><description><c>false</c> (default) — log a warning and let the session continue serving data subscriptions.</description></item>
/// </list>
/// Cancellation requested through <paramref name="cancellationToken"/> is
/// never treated as a tolerable subscribe failure; it always propagates.
/// </summary>
/// <param name="session">Session that has just been marked ready.</param>
/// <param name="cancellationToken">Cancellation token of the open request.</param>
/// <exception cref="SessionManagerException">
/// <c>RequireSubscribeOnOpen</c> is set and no subscription is configured
/// or the worker did not reply with an Ok status.
/// </exception>
/// <exception cref="OperationCanceledException">The caller cancelled the open request.</exception>
private async Task TryAutoSubscribeAlarmsAsync(
    GatewaySession session,
    CancellationToken cancellationToken)
{
    AlarmsOptions alarms = _options.Alarms;
    if (!alarms.Enabled)
    {
        return;
    }

    string subscription = ResolveAlarmSubscription(alarms);
    if (string.IsNullOrWhiteSpace(subscription))
    {
        const string diagnostic =
            "Alarms.Enabled is true but no SubscriptionExpression / DefaultArea is configured.";
        if (alarms.RequireSubscribeOnOpen)
        {
            throw new SessionManagerException(
                SessionManagerErrorCode.OpenFailed, diagnostic);
        }

        _logger.LogWarning(
            "Auto-subscribe skipped for session {SessionId}: {Diagnostic}",
            session.SessionId, diagnostic);
        return;
    }

    WorkerCommand command = new WorkerCommand
    {
        Command = new MxCommand
        {
            Kind = MxCommandKind.SubscribeAlarms,
            SubscribeAlarms = new SubscribeAlarmsCommand
            {
                SubscriptionExpression = subscription,
            },
        },
        EnqueueTimestamp = Timestamp.FromDateTimeOffset(_timeProvider.GetUtcNow()),
    };

    try
    {
        WorkerCommandReply reply = await session.InvokeAsync(command, cancellationToken)
            .ConfigureAwait(false);
        ProtocolStatusCode? code = reply.Reply?.ProtocolStatus?.Code;
        if (code != ProtocolStatusCode.Ok)
        {
            // A missing reply or status is treated the same as a non-Ok status.
            string diagnostic = reply.Reply?.DiagnosticMessage
                ?? reply.Reply?.ProtocolStatus?.Message
                ?? "Worker rejected SubscribeAlarms.";
            if (alarms.RequireSubscribeOnOpen)
            {
                throw new SessionManagerException(
                    SessionManagerErrorCode.OpenFailed,
                    $"Auto-subscribe failed for session {session.SessionId}: {diagnostic}");
            }

            _logger.LogWarning(
                "Auto-subscribe failed for session {SessionId} (status {StatusCode}): {Diagnostic}",
                session.SessionId, code, diagnostic);
            return;
        }

        _logger.LogInformation(
            "Alarm auto-subscribe succeeded for session {SessionId} on {Subscription}.",
            session.SessionId, subscription);
    }
    catch (SessionManagerException)
    {
        throw;
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // The caller abandoned the open. Swallowing this would hand back a
        // Ready session that nobody is waiting for, so let the open path
        // tear the session down instead.
        throw;
    }
    catch (Exception ex) when (!alarms.RequireSubscribeOnOpen)
    {
        _logger.LogWarning(
            ex,
            "Auto-subscribe threw for session {SessionId} on {Subscription}; alarm path remains inactive.",
            session.SessionId, subscription);
    }
}
|
|
|
|
/// <summary>
/// Picks the alarm subscription string: the configured
/// <c>SubscriptionExpression</c> when set; otherwise an expression built
/// from the local machine name and <c>DefaultArea</c>; otherwise an empty
/// string when neither is configured.
/// </summary>
/// <param name="alarms">Alarm options to read from.</param>
/// <returns>The subscription expression, or an empty string when none can be resolved.</returns>
private static string ResolveAlarmSubscription(AlarmsOptions alarms)
{
    // An explicit expression always wins over the DefaultArea shorthand.
    if (!string.IsNullOrWhiteSpace(alarms.SubscriptionExpression))
    {
        return alarms.SubscriptionExpression;
    }

    return string.IsNullOrWhiteSpace(alarms.DefaultArea)
        ? string.Empty
        : $@"\\{Environment.MachineName}\Galaxy!{alarms.DefaultArea}";
}
|
|
}
|