Resolve Server-044..050: KillWorker accounting + admin service hardening

Server-044  KillWorkerAsync catch path now calls _metrics.SessionRemoved
            so the open-session gauge does not leak when KillWorker throws.
Server-045  KillWorkerAsync routes through a new
            GatewaySession.KillWorkerWithCloseGateAsync that takes the
            per-session close lock, so concurrent kills count SessionsClosed
            exactly once.
Server-046  CloseSessionCoreAsync's SessionCloseStartedException branch and
            ShutdownAsync's kill fallback both now increment SessionsClosed
            (rather than only decrementing the open-session gauge), so the
            counter and gauge stay consistent.
Server-047  ApiKeysPage.ConfirmPendingAsync holds PendingAction across the
            awaited action and clears it in finally, matching the sessions
            pages.
Server-048  Closed: the Server-044/045 regression tests now cover the
            previously untested kill paths.
Server-049  IDashboardSessionAdminService + DashboardSessionAdminService
            now carry XML docs that pin the Admin gate, missing-session
            return-Fail semantics, and the dashboard-admin-kill reason.
Server-050  CloseSessionAsync and KillWorkerAsync now catch unexpected
            exceptions (after the existing SessionManagerException handlers)
            and return a friendly Fail; an OperationCanceledException tied to
            the caller's token still propagates.

All resolved on 2026-05-24; 503/503 gateway tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-24 08:49:34 -04:00
parent 6079c62709
commit 4d77279e7e
8 changed files with 403 additions and 16 deletions
@@ -222,16 +222,26 @@ public sealed class SessionManager : ISessionManager
cancellationToken.ThrowIfCancellationRequested();
GatewaySession session = GetRequiredSession(sessionId);
bool wasClosed = session.State == SessionState.Closed;
// Serialize concurrent kill/close attempts on this session by routing through the
// per-session close lock (Server-045). Returns whether the session was already in
// Closed state when the lock was acquired so the metric counter is incremented at
// most once across concurrent callers.
bool wasClosed;
try
{
session.KillWorker(reason);
wasClosed = await session.KillWorkerWithCloseGateAsync(reason, cancellationToken).ConfigureAwait(false);
}
catch (Exception exception)
{
session.MarkFaulted(exception.Message);
_metrics.Fault(SessionManagerErrorCode.CloseFailed.ToString());
// Server-044: the open-session gauge was incremented in OpenSessionAsync;
// every session reaching KillWorkerAsync had SessionOpened recorded. If the
// kill path throws, decrement the gauge here so mxgateway.sessions.open
// does not leak — mirroring the Server-006 fix on OpenSessionAsync.
_metrics.SessionRemoved();
await RemoveSessionAsync(session).ConfigureAwait(false);
throw new SessionManagerException(
SessionManagerErrorCode.CloseFailed,
@@ -297,10 +307,24 @@ public sealed class SessionManager : ISessionManager
exception,
"Graceful shutdown failed for session {SessionId}; killing worker.",
session.SessionId);
// Defensive fallback: CloseSessionCoreAsync's inner SessionCloseStartedException
// catch normally removes the session and accounts the close (Server-046). The
// outer fallback only fires for sessions still in the registry — route through
// KillWorkerAsync so the bookkeeping is identical to the dashboard kill path.
if (_registry.TryGet(session.SessionId, out _))
{
session.KillWorker(GatewayShutdownReason);
await RemoveSessionAsync(session).ConfigureAwait(false);
try
{
await KillWorkerAsync(session.SessionId, GatewayShutdownReason, cancellationToken).ConfigureAwait(false);
}
catch (SessionManagerException killException)
{
_logger.LogWarning(
killException,
"Worker kill fallback failed for session {SessionId}.",
session.SessionId);
}
}
}
}
@@ -332,7 +356,12 @@ public sealed class SessionManager : ISessionManager
session.MarkFaulted(exception.Message);
if (!wasClosed)
{
_metrics.SessionRemoved();
// Server-046: account the close as a SessionClosed (decrements the open-session
// gauge AND increments the sessions.closed counter), not just SessionRemoved.
// The session is being removed from the registry below; treating this as a
// half-finished close that only decremented the gauge under-counted the closed
// counter.
_metrics.SessionClosed();
}
_metrics.Fault(SessionManagerErrorCode.CloseFailed.ToString());