Resolve Server-044..050: KillWorker accounting + admin service hardening

Server-044  KillWorkerAsync catch path now calls _metrics.SessionRemoved
            so the open-session gauge does not leak when KillWorker throws.
Server-045  KillWorkerAsync routes through a new
            GatewaySession.KillWorkerWithCloseGateAsync that takes the
            per-session close lock, so concurrent kills count SessionsClosed
            exactly once.
Server-046  CloseSessionCoreAsync's SessionCloseStartedException branch and
            ShutdownAsync's kill fallback both increment SessionsClosed (not
            just the gauge), so the counter and gauge stay consistent.
Server-047  ApiKeysPage.ConfirmPendingAsync holds PendingAction across the
            awaited action and clears it in finally, matching the sessions
            pages.
Server-048  Closed: the 044/045 regression tests cover the previously-
            untested kill paths.
Server-049  IDashboardSessionAdminService + DashboardSessionAdminService
            now carry XML docs that pin the Admin gate, missing-session
            return-Fail semantics, and the dashboard-admin-kill reason.
Server-050  CloseSessionAsync and KillWorkerAsync catch unexpected
            exceptions after the SessionManagerException catches and return
            a friendly Fail; OperationCanceledException tied to the caller
            token still propagates.

All resolved on 2026-05-24; 503/503 gateway tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-24 08:49:34 -04:00
parent 6079c62709
commit 4d77279e7e
8 changed files with 403 additions and 16 deletions
@@ -328,9 +328,18 @@ else
return;
}
// Server-047: align the pending-action lifecycle with SessionsPage / SessionDetailsPage —
// hold PendingAction while the awaited action runs so the shared ConfirmDialog can render
// its in-flight (IsBusy) state, then clear in finally regardless of outcome.
Func<System.Security.Claims.ClaimsPrincipal, Task<DashboardApiKeyManagementResult>> action = PendingAction.Action;
PendingAction = null;
await RunManagementActionAsync(action).ConfigureAwait(false);
try
{
await RunManagementActionAsync(action).ConfigureAwait(false);
}
finally
{
PendingAction = null;
}
}
private sealed record PendingConfirm(
@@ -5,6 +5,19 @@ using ZB.MOM.WW.MxGateway.Server.Sessions;
namespace ZB.MOM.WW.MxGateway.Server.Dashboard;
/// <summary>
/// Default implementation of <see cref="IDashboardSessionAdminService"/>: gates
/// destructive session actions on the <see cref="DashboardRoles.Admin"/> role,
/// audit-logs successful operations, and converts <see cref="SessionManagerException"/>
/// (and any other unexpected exceptions) into <see cref="DashboardSessionAdminResult.Fail(string)"/>
/// so the Blazor pages never see a raw exception.
/// </summary>
/// <remarks>
/// The constant <c>dashboard-admin-kill</c> is the reason passed to
/// <see cref="ISessionManager.KillWorkerAsync"/> and forwarded as the
/// <c>reason</c> tag on the <c>mxgateway.workers.killed</c> counter and in
/// the worker-kill audit log entries.
/// </remarks>
public sealed class DashboardSessionAdminService(
ISessionManager sessionManager,
IHttpContextAccessor httpContextAccessor,
@@ -16,6 +29,7 @@ public sealed class DashboardSessionAdminService(
private readonly ILogger<DashboardSessionAdminService> _logger =
logger ?? NullLogger<DashboardSessionAdminService>.Instance;
/// <inheritdoc />
public bool CanManage(ClaimsPrincipal user)
{
ArgumentNullException.ThrowIfNull(user);
@@ -24,6 +38,7 @@ public sealed class DashboardSessionAdminService(
&& user.IsInRole(DashboardRoles.Admin);
}
/// <inheritdoc />
public async Task<DashboardSessionAdminResult> CloseSessionAsync(
ClaimsPrincipal user,
string sessionId,
@@ -72,8 +87,27 @@ public sealed class DashboardSessionAdminService(
return DashboardSessionAdminResult.Fail(
$"Close failed: {exception.Message}");
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
throw;
}
catch (Exception exception)
{
// Server-050: any non-SessionManagerException (e.g. an IOException or
// InvalidOperationException from the session DisposeAsync / pipe teardown
// path) used to propagate raw into Blazor's error boundary. Convert it to
// a friendly failure so the Razor pages see only DashboardSessionAdminResult.
_logger.LogWarning(
exception,
"Dashboard admin {Actor} close failed unexpectedly for session {SessionId}.",
actor,
sessionId);
return DashboardSessionAdminResult.Fail(
$"Close failed unexpectedly for session {sessionId}. See the gateway log for details.");
}
}
/// <inheritdoc />
public async Task<DashboardSessionAdminResult> KillWorkerAsync(
ClaimsPrincipal user,
string sessionId,
@@ -122,6 +156,26 @@ public sealed class DashboardSessionAdminService(
return DashboardSessionAdminResult.Fail(
$"Kill failed: {exception.Message}");
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
throw;
}
catch (Exception exception)
{
// Server-050: any non-SessionManagerException (e.g. an IOException from
// worker pipe teardown surfacing through session.DisposeAsync, or an
// InvalidOperationException from a corrupted worker handle) used to
// propagate raw into Blazor's error boundary. Convert it to a friendly
// failure so the page renders the ResultMessage rather than the circuit
// error page.
_logger.LogWarning(
exception,
"Dashboard admin {Actor} kill failed unexpectedly for session {SessionId}.",
actor,
sessionId);
return DashboardSessionAdminResult.Fail(
$"Kill failed unexpectedly for session {sessionId}. See the gateway log for details.");
}
}
private static string ResolveActor(ClaimsPrincipal user)
@@ -2,15 +2,82 @@ using System.Security.Claims;
namespace ZB.MOM.WW.MxGateway.Server.Dashboard;
/// <summary>
/// Dashboard surface for the destructive session-management actions —
/// Close (graceful shutdown) and Kill (force-terminate) — which are
/// available only to callers in the Admin role.
/// </summary>
/// <remarks>
/// The dashboard binds the destructive Close/Kill UI to this service so
/// the underlying <see cref="Sessions.ISessionManager"/> calls flow through
/// a single audited and role-gated entry point. All operations are gated
/// by <see cref="CanManage"/>; non-Admin callers are rejected with a
/// <c>Succeeded = false</c> result rather than throwing. Missing sessions
/// also surface as <see cref="DashboardSessionAdminResult.Fail(string)"/> so
/// Razor pages can render the message without an error boundary. Each
/// successful call is logged at Information including the acting user
/// (from <see cref="ClaimsPrincipal.Identity"/>'s name) and the remote
/// address resolved from <see cref="IHttpContextAccessor"/>.
/// </remarks>
public interface IDashboardSessionAdminService
{
/// <summary>
/// Returns whether the given principal is authorized to perform
/// destructive session-management actions.
/// </summary>
/// <remarks>
/// Requires <see cref="System.Security.Principal.IIdentity.IsAuthenticated"/>
/// to be true and membership in the
/// <see cref="DashboardRoles.Admin"/> role. Pages typically gate the
/// Close/Kill buttons on this value at render time so non-Admin
/// viewers never see them.
/// </remarks>
/// <param name="user">Caller principal.</param>
/// <returns><c>true</c> when the caller may close or kill sessions; otherwise <c>false</c>.</returns>
bool CanManage(ClaimsPrincipal user);
/// <summary>
/// Closes the given session gracefully (worker is given the configured
/// shutdown grace period before being terminated).
/// </summary>
/// <remarks>
/// Returns <see cref="DashboardSessionAdminResult.Fail(string)"/>
/// when the caller is not Admin, when the session id is blank, or when
/// <see cref="Sessions.ISessionManager.CloseSessionAsync"/> raises a
/// <see cref="Sessions.SessionManagerException"/> (including the
/// <see cref="Sessions.SessionManagerErrorCode.SessionNotFound"/>
/// case). Successful closes are audit-logged with the caller name,
/// session id, and remote address.
/// </remarks>
/// <param name="user">Caller principal.</param>
/// <param name="sessionId">Session identifier to close.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Result describing success/failure and a user-facing message.</returns>
Task<DashboardSessionAdminResult> CloseSessionAsync(
ClaimsPrincipal user,
string sessionId,
CancellationToken cancellationToken);
/// <summary>
/// Force-terminates the worker process backing the given session without
/// attempting a graceful shutdown.
/// </summary>
/// <remarks>
/// Invoked from the dashboard Kill button. Uses the
/// <c>dashboard-admin-kill</c> reason constant — that string reaches
/// the audit log and the <c>mxgateway.workers.killed</c> metric tag.
/// Returns <see cref="DashboardSessionAdminResult.Fail(string)"/> for
/// non-Admin callers, blank session ids, or any
/// <see cref="Sessions.SessionManagerException"/> from the underlying
/// manager (the <see cref="Sessions.SessionManagerErrorCode.SessionNotFound"/>
/// case is recognized and reported as "not found"). Successful kills
/// are audit-logged with the caller name, session id, and remote
/// address.
/// </remarks>
/// <param name="user">Caller principal.</param>
/// <param name="sessionId">Session identifier to kill.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Result describing success/failure and a user-facing message.</returns>
Task<DashboardSessionAdminResult> KillWorkerAsync(
ClaimsPrincipal user,
string sessionId,
@@ -840,6 +840,45 @@ public sealed class GatewaySession
TransitionTo(SessionState.Closed);
}
/// <summary>
/// Force-kills the worker while holding the per-session close lock, so that
/// close and kill callers targeting the same session run one at a time, and
/// reports whether the session had already reached
/// <see cref="SessionState.Closed"/> before this call did its work.
/// </summary>
/// <remarks>
/// The "already closed" flag is sampled only after <c>_closeLock</c> has been
/// acquired, mirroring <see cref="CloseAsync"/>'s use of the same lock. Callers
/// use the flag to dedup metric accounting (Server-045) — for example, recording
/// <c>SessionClosed</c> only when the flag is <c>false</c>. The worker kill and
/// the transition to <see cref="SessionState.Closed"/> are performed on every
/// call, whatever the flag's value.
/// </remarks>
/// <param name="reason">Reason for killing the worker.</param>
/// <param name="cancellationToken">Token observed while waiting for the close lock.</param>
/// <returns><c>true</c> if the session was already <see cref="SessionState.Closed"/> when the lock was acquired; otherwise <c>false</c>.</returns>
public async ValueTask<bool> KillWorkerWithCloseGateAsync(
    string reason,
    CancellationToken cancellationToken)
{
    await _closeLock.WaitAsync(cancellationToken).ConfigureAwait(false);
    try
    {
        // Sample the state under the state lock, before this call changes it.
        bool closedOnEntry;
        lock (_syncRoot)
        {
            closedOnEntry = _state == SessionState.Closed;
        }

        // A session without a worker client has nothing to kill.
        if (_workerClient is { } worker)
        {
            worker.Kill(reason);
        }

        TransitionTo(SessionState.Closed);
        return closedOnEntry;
    }
    finally
    {
        // Always hand the close gate back, even when the kill throws.
        _closeLock.Release();
    }
}
/// <summary>
/// Disposes the session and frees associated resources.
/// </summary>
@@ -222,16 +222,26 @@ public sealed class SessionManager : ISessionManager
cancellationToken.ThrowIfCancellationRequested();
GatewaySession session = GetRequiredSession(sessionId);
bool wasClosed = session.State == SessionState.Closed;
// Serialize concurrent kill/close attempts on this session by routing through the
// per-session close lock (Server-045). Returns whether the session was already in
// Closed state when the lock was acquired so the metric counter is incremented at
// most once across concurrent callers.
bool wasClosed;
try
{
session.KillWorker(reason);
wasClosed = await session.KillWorkerWithCloseGateAsync(reason, cancellationToken).ConfigureAwait(false);
}
catch (Exception exception)
{
session.MarkFaulted(exception.Message);
_metrics.Fault(SessionManagerErrorCode.CloseFailed.ToString());
// Server-044: the open-session gauge was incremented in OpenSessionAsync;
// every session reaching KillWorkerAsync had SessionOpened recorded. If the
// kill path throws, decrement the gauge here so mxgateway.sessions.open
// does not leak — mirroring the Server-006 fix on OpenSessionAsync.
_metrics.SessionRemoved();
await RemoveSessionAsync(session).ConfigureAwait(false);
throw new SessionManagerException(
SessionManagerErrorCode.CloseFailed,
@@ -297,10 +307,24 @@ public sealed class SessionManager : ISessionManager
exception,
"Graceful shutdown failed for session {SessionId}; killing worker.",
session.SessionId);
// Defensive fallback: CloseSessionCoreAsync's inner SessionCloseStartedException
// catch normally removes the session and accounts the close (Server-046). The
// outer fallback only fires for sessions still in the registry — route through
// KillWorkerAsync so the bookkeeping is identical to the dashboard kill path.
if (_registry.TryGet(session.SessionId, out _))
{
session.KillWorker(GatewayShutdownReason);
await RemoveSessionAsync(session).ConfigureAwait(false);
try
{
await KillWorkerAsync(session.SessionId, GatewayShutdownReason, cancellationToken).ConfigureAwait(false);
}
catch (SessionManagerException killException)
{
_logger.LogWarning(
killException,
"Worker kill fallback failed for session {SessionId}.",
session.SessionId);
}
}
}
}
@@ -332,7 +356,12 @@ public sealed class SessionManager : ISessionManager
session.MarkFaulted(exception.Message);
if (!wasClosed)
{
_metrics.SessionRemoved();
// Server-046: account the close as a SessionClosed (decrements the open-session
// gauge AND increments the sessions.closed counter), not just SessionRemoved.
// The session is being removed from the registry below; treating this as a
// half-finished close that only decremented the gauge under-counted the closed
// counter.
_metrics.SessionClosed();
}
_metrics.Fault(SessionManagerErrorCode.CloseFailed.ToString());
@@ -115,6 +115,52 @@ public sealed class DashboardSessionAdminServiceTests
Assert.True(service.CanManage(CreateUser(DashboardRoles.Admin)));
}
/// <summary>
/// Regression for Server-050 (close path): when the session manager's
/// <c>CloseSessionAsync</c> throws something other than a
/// <see cref="SessionManagerException"/> — for example an
/// <see cref="InvalidOperationException"/> or <see cref="IOException"/> coming out of
/// <c>RemoveSessionAsync</c>/<c>DisposeAsync</c> — the admin service must answer with
/// <see cref="DashboardSessionAdminResult.Fail(string)"/> instead of letting the raw
/// exception reach Blazor's error boundary.
/// </summary>
[Fact]
public async Task CloseSessionAsync_WhenManagerThrowsUnexpected_ReturnsFriendlyFail()
{
    // Arrange: a manager whose close path fails with a non-SessionManagerException.
    InvalidOperationException unexpectedFailure = new("unexpected");
    FakeSessionManager throwingManager = new() { CloseThrowsUnexpected = unexpectedFailure };
    DashboardSessionAdminService sut = CreateService(throwingManager);
    var admin = CreateUser(DashboardRoles.Admin);

    // Act
    DashboardSessionAdminResult outcome = await sut.CloseSessionAsync(
        admin,
        "session-1",
        CancellationToken.None);

    // Assert: a failure result that carries a non-blank, user-facing message.
    Assert.False(outcome.Succeeded);
    Assert.False(string.IsNullOrWhiteSpace(outcome.Message));
}
/// <summary>
/// Regression for Server-050 (kill path): an unexpected exception thrown by the
/// session manager's <c>KillWorkerAsync</c> must be reported as a failed
/// <see cref="DashboardSessionAdminResult"/> with a message, not rethrown.
/// </summary>
[Fact]
public async Task KillWorkerAsync_WhenManagerThrowsUnexpected_ReturnsFriendlyFail()
{
    // Arrange: a manager whose kill path fails with a non-SessionManagerException.
    IOException unexpectedFailure = new("pipe broken");
    FakeSessionManager throwingManager = new() { KillThrowsUnexpected = unexpectedFailure };
    DashboardSessionAdminService sut = CreateService(throwingManager);
    var admin = CreateUser(DashboardRoles.Admin);

    // Act
    DashboardSessionAdminResult outcome = await sut.KillWorkerAsync(
        admin,
        "session-1",
        CancellationToken.None);

    // Assert: a failure result that carries a non-blank, user-facing message.
    Assert.False(outcome.Succeeded);
    Assert.False(string.IsNullOrWhiteSpace(outcome.Message));
}
private static DashboardSessionAdminService CreateService(ISessionManager sessionManager)
{
DefaultHttpContext httpContext = new();
@@ -150,6 +196,10 @@ public sealed class DashboardSessionAdminServiceTests
public bool CloseThrowsNotFound { get; init; }
/// <summary>
/// Gets the exception the fake's close path throws when set; <c>null</c> leaves the
/// close path on its normal result.
/// </summary>
public Exception? CloseThrowsUnexpected { get; init; }
/// <summary>
/// Gets the exception the fake's kill path throws when set (after the kill call has
/// been recorded); <c>null</c> leaves the kill path on its normal result.
/// </summary>
public Exception? KillThrowsUnexpected { get; init; }
public Task<GatewaySession> OpenSessionAsync(
SessionOpenRequest request,
string? clientIdentity,
@@ -194,6 +244,11 @@ public sealed class DashboardSessionAdminServiceTests
$"Session {sessionId} was not found.");
}
if (CloseThrowsUnexpected is not null)
{
throw CloseThrowsUnexpected;
}
return Task.FromResult(new SessionCloseResult(sessionId, SessionState.Closed, AlreadyClosed: false));
}
@@ -205,6 +260,11 @@ public sealed class DashboardSessionAdminServiceTests
KillCount++;
LastKilledSessionId = sessionId;
LastKillReason = reason;
if (KillThrowsUnexpected is not null)
{
throw KillThrowsUnexpected;
}
return Task.FromResult(new SessionCloseResult(sessionId, SessionState.Closed, AlreadyClosed: false));
}
@@ -410,7 +410,10 @@ public sealed class SessionManagerTests
Assert.Equal(1, failingWorkerClient.KillCount);
Assert.Equal(1, failingWorkerClient.DisposeCount);
GatewayMetricsSnapshot snapshot = metrics.GetSnapshot();
Assert.Equal(0, snapshot.SessionsClosed);
// Server-046: a close-that-failed now accounts as SessionClosed (counter += 1) rather
// than SessionRemoved (gauge -= 1, counter unchanged). The session is being removed
// from the registry on this path, so it must show up in the closed count.
Assert.Equal(1, snapshot.SessionsClosed);
Assert.False(snapshot.EventsBySession.ContainsKey(firstSession.SessionId));
Assert.Equal(1, snapshot.OpenSessions);
}
@@ -495,6 +498,110 @@ public sealed class SessionManagerTests
Assert.Equal(SessionManagerErrorCode.SessionNotFound, exception.ErrorCode);
}
/// <summary>
/// Regression for Server-044: a worker kill that throws must not leak the
/// <c>mxgateway.sessions.open</c> gauge. The manager is expected to surface a
/// <see cref="SessionManagerErrorCode.CloseFailed"/> error, drop the session from the
/// registry, bring the open-session gauge back to zero, and record a fault
/// (parity with the Server-006 fix in <c>OpenSessionAsync</c>).
/// </summary>
[Fact]
public async Task KillWorkerAsync_WhenSessionKillThrows_DecrementsOpenSessionGauge()
{
    // Arrange: one open session whose worker client throws from Kill.
    InvalidOperationException killFailure = new("worker kill failed");
    FakeWorkerClient failingWorker = new() { KillException = killFailure };
    using GatewayMetrics metrics = new();
    SessionManager manager = CreateManager(
        new FakeSessionWorkerClientFactory(failingWorker),
        metrics: metrics);
    GatewaySession opened = await manager.OpenSessionAsync(
        CreateOpenRequest(),
        "client-1",
        CancellationToken.None);
    Assert.Equal(1, metrics.GetSnapshot().OpenSessions);

    // Act: the kill is expected to fail with a SessionManagerException.
    async Task KillOpenedSessionAsync()
    {
        await manager.KillWorkerAsync(opened.SessionId, "test-kill", CancellationToken.None);
    }

    SessionManagerException thrown =
        await Assert.ThrowsAsync<SessionManagerException>(KillOpenedSessionAsync);

    // Assert: failure is reported as CloseFailed, the session is gone, and the gauge is balanced.
    Assert.Equal(SessionManagerErrorCode.CloseFailed, thrown.ErrorCode);
    Assert.False(manager.TryGetSession(opened.SessionId, out _));
    GatewayMetricsSnapshot afterKill = metrics.GetSnapshot();
    Assert.Equal(0, afterKill.OpenSessions);
    Assert.True(afterKill.Faults > 0);
}
/// <summary>
/// Regression for Server-045 / Server-048: two kill requests against the same session
/// must bump <c>mxgateway.sessions.closed</c> only once. Whichever kill loses either
/// sees the session as already closed or finds it gone from the registry; the
/// latter surfaces as <see cref="SessionManagerErrorCode.SessionNotFound"/> and is
/// tolerated for the second caller.
/// </summary>
[Fact]
public async Task KillWorkerAsync_ConcurrentCallsOnSameSession_CountClosedExactlyOnce()
{
    // Arrange: a single open session backed by a well-behaved fake worker.
    FakeWorkerClient worker = new();
    using GatewayMetrics metrics = new();
    SessionManager manager = CreateManager(
        new FakeSessionWorkerClientFactory(worker),
        metrics: metrics);
    GatewaySession target = await manager.OpenSessionAsync(
        CreateOpenRequest(),
        "client-1",
        CancellationToken.None);

    // Kill helper for the second caller: a session that has already been removed
    // counts as "already closed" rather than as a test failure.
    async Task<SessionCloseResult> KillToleratingRemovalAsync(string reason)
    {
        try
        {
            return await manager.KillWorkerAsync(target.SessionId, reason, CancellationToken.None);
        }
        catch (SessionManagerException missing) when (missing.ErrorCode == SessionManagerErrorCode.SessionNotFound)
        {
            return new SessionCloseResult(target.SessionId, SessionState.Closed, AlreadyClosed: true);
        }
    }

    // Act: start the first kill on this thread, then race a second one from the pool.
    Task<SessionCloseResult> first = manager.KillWorkerAsync(target.SessionId, "kill-a", CancellationToken.None);
    Task<SessionCloseResult> second = Task.Run(() => KillToleratingRemovalAsync("kill-b"));
    await Task.WhenAll(first, second);

    // Assert: exactly one close was counted and nothing is left open or registered.
    GatewayMetricsSnapshot afterKills = metrics.GetSnapshot();
    Assert.Equal(1, afterKills.SessionsClosed);
    Assert.Equal(0, afterKills.OpenSessions);
    Assert.False(manager.TryGetSession(target.SessionId, out _));
}
/// <summary>
/// Regression for Server-046: when a session's graceful close fails during
/// <c>ShutdownAsync</c>, the shutdown must still account for the session as closed —
/// the open-session gauge returns to zero AND the <c>mxgateway.sessions.closed</c>
/// counter is incremented — and the session must leave the registry. The assertions
/// hold whether the accounting happened on the graceful-close failure path or on the
/// worker-kill fallback.
/// </summary>
[Fact]
public async Task ShutdownAsync_WhenSessionCloseThrows_StillDecrementsOpenSessionGaugeAndIncrementsClosedCounter()
{
    // Arrange: one open session whose worker client throws when asked to shut down.
    InvalidOperationException shutdownFailure = new("worker shutdown failed");
    FakeWorkerClient failingWorker = new() { ShutdownException = shutdownFailure };
    using GatewayMetrics metrics = new();
    SessionManager manager = CreateManager(
        new FakeSessionWorkerClientFactory(failingWorker),
        metrics: metrics);
    GatewaySession opened = await manager.OpenSessionAsync(
        CreateOpenRequest(),
        "client-1",
        CancellationToken.None);
    Assert.Equal(1, metrics.GetSnapshot().OpenSessions);

    // Act
    await manager.ShutdownAsync(CancellationToken.None);

    // Assert: gauge drained, close counted once, session no longer registered.
    GatewayMetricsSnapshot afterShutdown = metrics.GetSnapshot();
    Assert.Equal(0, afterShutdown.OpenSessions);
    Assert.Equal(1, afterShutdown.SessionsClosed);
    Assert.False(manager.TryGetSession(opened.SessionId, out _));
}
/// <summary>Verifies that when worker creation fails, the session is removed from the registry.</summary>
[Fact]
public async Task OpenSessionAsync_WhenWorkerCreationFails_RemovesSessionFromRegistry()
@@ -726,6 +833,9 @@ public sealed class SessionManagerTests
/// <summary>Gets the exception to throw when shutdown is called, if any.</summary>
public Exception? ShutdownException { get; init; }
/// <summary>Gets the exception to throw when kill is called, if any.</summary>
public Exception? KillException { get; init; }
/// <summary>Gets a value indicating whether to block shutdown on the fake worker client.</summary>
public bool BlockShutdown { get; init; }
@@ -803,6 +913,11 @@ public sealed class SessionManagerTests
public void Kill(string reason)
{
    // Count the attempt first so tests can observe kills that end up throwing.
    KillCount++;

    // A configured KillException simulates a worker whose kill fails; the state is
    // left untouched in that case.
    if (KillException is { } configuredFailure)
    {
        throw configuredFailure;
    }

    State = WorkerClientState.Faulted;
}