Resolve Server-044..050: KillWorker accounting + admin service hardening
Server-044 KillWorkerAsync catch path now calls _metrics.SessionRemoved
so the open-session gauge does not leak when KillWorker throws.
Server-045 KillWorkerAsync routes through a new
GatewaySession.KillWorkerWithCloseGateAsync that takes the
per-session close lock, so concurrent kills count SessionsClosed
exactly once.
Server-046 CloseSessionCoreAsync's SessionCloseStartedException branch and
ShutdownAsync's kill fallback both increment the SessionsClosed counter
(not just decrement the gauge), so the counter and gauge stay consistent.
Server-047 ApiKeysPage.ConfirmPendingAsync holds PendingAction across the
awaited action and clears it in finally, matching the sessions
pages.
Server-048 Closed: the 044/045 regression tests cover the previously
untested kill paths.
Server-049 IDashboardSessionAdminService + DashboardSessionAdminService
now carry XML docs that pin the Admin gate, missing-session
return-Fail semantics, and the dashboard-admin-kill reason.
Server-050 CloseSessionAsync and KillWorkerAsync catch unexpected
exceptions after the SessionManagerException catches and return
a friendly Fail; OperationCanceledException tied to the caller
token still propagates.
All resolved on 2026-05-24; 503/503 gateway tests pass.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -328,9 +328,18 @@ else
|
||||
return;
|
||||
}
|
||||
|
||||
// Server-047: align the pending-action lifecycle with SessionsPage / SessionDetailsPage —
|
||||
// hold PendingAction while the awaited action runs so the shared ConfirmDialog can render
|
||||
// its in-flight (IsBusy) state, then clear in finally regardless of outcome.
|
||||
Func<System.Security.Claims.ClaimsPrincipal, Task<DashboardApiKeyManagementResult>> action = PendingAction.Action;
|
||||
PendingAction = null;
|
||||
await RunManagementActionAsync(action).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
await RunManagementActionAsync(action).ConfigureAwait(false);
|
||||
}
|
||||
finally
|
||||
{
|
||||
PendingAction = null;
|
||||
}
|
||||
}
|
||||
|
||||
private sealed record PendingConfirm(
|
||||
|
||||
@@ -5,6 +5,19 @@ using ZB.MOM.WW.MxGateway.Server.Sessions;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Dashboard;
|
||||
|
||||
/// <summary>
|
||||
/// Default implementation of <see cref="IDashboardSessionAdminService"/>: gates
|
||||
/// destructive session actions on the <see cref="DashboardRoles.Admin"/> role,
|
||||
/// audit-logs successful operations, and converts <see cref="SessionManagerException"/>
|
||||
/// (and any other unexpected exceptions) into <see cref="DashboardSessionAdminResult.Fail(string)"/>
|
||||
/// so the Blazor pages never see a raw exception.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The constant <c>dashboard-admin-kill</c> is the reason passed to
|
||||
/// <see cref="ISessionManager.KillWorkerAsync"/> and forwarded as the
|
||||
/// <c>reason</c> tag on the <c>mxgateway.workers.killed</c> counter and in
|
||||
/// the worker-kill audit log entries.
|
||||
/// </remarks>
|
||||
public sealed class DashboardSessionAdminService(
|
||||
ISessionManager sessionManager,
|
||||
IHttpContextAccessor httpContextAccessor,
|
||||
@@ -16,6 +29,7 @@ public sealed class DashboardSessionAdminService(
|
||||
private readonly ILogger<DashboardSessionAdminService> _logger =
|
||||
logger ?? NullLogger<DashboardSessionAdminService>.Instance;
|
||||
|
||||
/// <inheritdoc />
|
||||
public bool CanManage(ClaimsPrincipal user)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(user);
|
||||
@@ -24,6 +38,7 @@ public sealed class DashboardSessionAdminService(
|
||||
&& user.IsInRole(DashboardRoles.Admin);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<DashboardSessionAdminResult> CloseSessionAsync(
|
||||
ClaimsPrincipal user,
|
||||
string sessionId,
|
||||
@@ -72,8 +87,27 @@ public sealed class DashboardSessionAdminService(
|
||||
return DashboardSessionAdminResult.Fail(
|
||||
$"Close failed: {exception.Message}");
|
||||
}
|
||||
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
catch (Exception exception)
|
||||
{
|
||||
// Server-050: any non-SessionManagerException (e.g. an IOException or
|
||||
// InvalidOperationException from the session DisposeAsync / pipe teardown
|
||||
// path) used to propagate raw into Blazor's error boundary. Convert it to
|
||||
// a friendly failure so the Razor pages see only DashboardSessionAdminResult.
|
||||
_logger.LogWarning(
|
||||
exception,
|
||||
"Dashboard admin {Actor} close failed unexpectedly for session {SessionId}.",
|
||||
actor,
|
||||
sessionId);
|
||||
return DashboardSessionAdminResult.Fail(
|
||||
$"Close failed unexpectedly for session {sessionId}. See the gateway log for details.");
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<DashboardSessionAdminResult> KillWorkerAsync(
|
||||
ClaimsPrincipal user,
|
||||
string sessionId,
|
||||
@@ -122,6 +156,26 @@ public sealed class DashboardSessionAdminService(
|
||||
return DashboardSessionAdminResult.Fail(
|
||||
$"Kill failed: {exception.Message}");
|
||||
}
|
||||
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
catch (Exception exception)
|
||||
{
|
||||
// Server-050: any non-SessionManagerException (e.g. an IOException from
|
||||
// worker pipe teardown surfacing through session.DisposeAsync, or an
|
||||
// InvalidOperationException from a corrupted worker handle) used to
|
||||
// propagate raw into Blazor's error boundary. Convert it to a friendly
|
||||
// failure so the page renders the ResultMessage rather than the circuit
|
||||
// error page.
|
||||
_logger.LogWarning(
|
||||
exception,
|
||||
"Dashboard admin {Actor} kill failed unexpectedly for session {SessionId}.",
|
||||
actor,
|
||||
sessionId);
|
||||
return DashboardSessionAdminResult.Fail(
|
||||
$"Kill failed unexpectedly for session {sessionId}. See the gateway log for details.");
|
||||
}
|
||||
}
|
||||
|
||||
private static string ResolveActor(ClaimsPrincipal user)
|
||||
|
||||
@@ -2,15 +2,82 @@ using System.Security.Claims;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Dashboard;
|
||||
|
||||
/// <summary>
|
||||
/// Dashboard surface for the destructive session-management actions —
|
||||
/// Close (graceful shutdown) and Kill (force-terminate) — exposed to the
|
||||
/// Admin role.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The dashboard binds the destructive Close/Kill UI to this service so
|
||||
/// the underlying <see cref="Sessions.ISessionManager"/> calls flow through
|
||||
/// a single audited and role-gated entry point. All operations are gated
|
||||
/// by <see cref="CanManage"/>; non-Admin callers are rejected with a
|
||||
/// <c>Succeeded = false</c> result rather than throwing. Missing sessions
|
||||
/// also surface as <see cref="DashboardSessionAdminResult.Fail(string)"/> so
|
||||
/// Razor pages can render the message without an error boundary. Each
|
||||
/// successful call is logged at Information including the acting user
|
||||
/// (from <see cref="ClaimsPrincipal.Identity"/>'s name) and the remote
|
||||
/// address resolved from <see cref="IHttpContextAccessor"/>.
|
||||
/// </remarks>
|
||||
public interface IDashboardSessionAdminService
|
||||
{
|
||||
/// <summary>
|
||||
/// Returns whether the given principal is authorized to perform
|
||||
/// destructive session-management actions.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Requires <see cref="System.Security.Principal.IIdentity.IsAuthenticated"/>
|
||||
/// to be true and membership in the
|
||||
/// <see cref="DashboardRoles.Admin"/> role. Pages typically gate the
|
||||
/// Close/Kill buttons on this value at render time so non-Admin
|
||||
/// viewers never see them.
|
||||
/// </remarks>
|
||||
/// <param name="user">Caller principal.</param>
|
||||
/// <returns><c>true</c> when the caller may close or kill sessions; otherwise <c>false</c>.</returns>
|
||||
bool CanManage(ClaimsPrincipal user);
|
||||
|
||||
/// <summary>
|
||||
/// Closes the given session gracefully (worker is given the configured
|
||||
/// shutdown grace period before being terminated).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Returns <see cref="DashboardSessionAdminResult.Fail(string)"/>
|
||||
/// when the caller is not Admin, when the session id is blank, or when
|
||||
/// <see cref="Sessions.ISessionManager.CloseSessionAsync"/> raises a
|
||||
/// <see cref="Sessions.SessionManagerException"/> (including the
|
||||
/// <see cref="Sessions.SessionManagerErrorCode.SessionNotFound"/>
|
||||
/// case). Successful closes are audit-logged with the caller name,
|
||||
/// session id, and remote address.
|
||||
/// </remarks>
|
||||
/// <param name="user">Caller principal.</param>
|
||||
/// <param name="sessionId">Session identifier to close.</param>
|
||||
/// <param name="cancellationToken">Cancellation token.</param>
|
||||
/// <returns>Result describing success/failure and a user-facing message.</returns>
|
||||
Task<DashboardSessionAdminResult> CloseSessionAsync(
|
||||
ClaimsPrincipal user,
|
||||
string sessionId,
|
||||
CancellationToken cancellationToken);
|
||||
|
||||
/// <summary>
|
||||
/// Force-terminates the worker process backing the given session without
|
||||
/// attempting a graceful shutdown.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Invoked from the dashboard Kill button. Uses the
|
||||
/// <c>dashboard-admin-kill</c> reason constant — that string reaches
|
||||
/// the audit log and the <c>mxgateway.workers.killed</c> metric tag.
|
||||
/// Returns <see cref="DashboardSessionAdminResult.Fail(string)"/> for
|
||||
/// non-Admin callers, blank session ids, or any
|
||||
/// <see cref="Sessions.SessionManagerException"/> from the underlying
|
||||
/// manager (the <see cref="Sessions.SessionManagerErrorCode.SessionNotFound"/>
|
||||
/// case is recognized and reported as "not found"). Successful kills
|
||||
/// are audit-logged with the caller name, session id, and remote
|
||||
/// address.
|
||||
/// </remarks>
|
||||
/// <param name="user">Caller principal.</param>
|
||||
/// <param name="sessionId">Session identifier to kill.</param>
|
||||
/// <param name="cancellationToken">Cancellation token.</param>
|
||||
/// <returns>Result describing success/failure and a user-facing message.</returns>
|
||||
Task<DashboardSessionAdminResult> KillWorkerAsync(
|
||||
ClaimsPrincipal user,
|
||||
string sessionId,
|
||||
|
||||
@@ -840,6 +840,45 @@ public sealed class GatewaySession
|
||||
TransitionTo(SessionState.Closed);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Terminates the worker process immediately while holding the per-session
|
||||
/// close lock so concurrent close/kill callers serialize. Returns whether the
|
||||
/// session was already closed when the lock was acquired so the caller can
|
||||
/// deduplicate metric accounting (e.g. only record <c>SessionClosed</c> when
|
||||
/// the session was not already closed).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Mirrors <see cref="CloseAsync"/>'s use of <c>_closeLock</c> so that
|
||||
/// a Close in flight from one caller and a Kill from another do not
|
||||
/// race on the "was the session already closed" observation that
|
||||
/// drives metric increments (Server-045).
|
||||
/// </remarks>
|
||||
/// <param name="reason">Reason for killing the worker.</param>
|
||||
/// <param name="cancellationToken">Cancellation token observed while waiting for the close lock.</param>
|
||||
/// <returns><c>true</c> if the session was already <see cref="SessionState.Closed"/> when the lock was acquired; otherwise <c>false</c>.</returns>
|
||||
public async ValueTask<bool> KillWorkerWithCloseGateAsync(
|
||||
string reason,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
await _closeLock.WaitAsync(cancellationToken).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
bool wasClosed;
|
||||
lock (_syncRoot)
|
||||
{
|
||||
wasClosed = _state == SessionState.Closed;
|
||||
}
|
||||
|
||||
_workerClient?.Kill(reason);
|
||||
TransitionTo(SessionState.Closed);
|
||||
return wasClosed;
|
||||
}
|
||||
finally
|
||||
{
|
||||
_closeLock.Release();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Disposes the session and frees associated resources.
|
||||
/// </summary>
|
||||
|
||||
@@ -222,16 +222,26 @@ public sealed class SessionManager : ISessionManager
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
|
||||
GatewaySession session = GetRequiredSession(sessionId);
|
||||
bool wasClosed = session.State == SessionState.Closed;
|
||||
|
||||
// Serialize concurrent kill/close attempts on this session by routing through the
|
||||
// per-session close lock (Server-045). Returns whether the session was already in
|
||||
// Closed state when the lock was acquired so the metric counter is incremented at
|
||||
// most once across concurrent callers.
|
||||
bool wasClosed;
|
||||
try
|
||||
{
|
||||
session.KillWorker(reason);
|
||||
wasClosed = await session.KillWorkerWithCloseGateAsync(reason, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception exception)
|
||||
{
|
||||
session.MarkFaulted(exception.Message);
|
||||
_metrics.Fault(SessionManagerErrorCode.CloseFailed.ToString());
|
||||
|
||||
// Server-044: the open-session gauge was incremented in OpenSessionAsync;
|
||||
// every session reaching KillWorkerAsync had SessionOpened recorded. If the
|
||||
// kill path throws, decrement the gauge here so mxgateway.sessions.open
|
||||
// does not leak — mirroring the Server-006 fix on OpenSessionAsync.
|
||||
_metrics.SessionRemoved();
|
||||
await RemoveSessionAsync(session).ConfigureAwait(false);
|
||||
throw new SessionManagerException(
|
||||
SessionManagerErrorCode.CloseFailed,
|
||||
@@ -297,10 +307,24 @@ public sealed class SessionManager : ISessionManager
|
||||
exception,
|
||||
"Graceful shutdown failed for session {SessionId}; killing worker.",
|
||||
session.SessionId);
|
||||
|
||||
// Defensive fallback: CloseSessionCoreAsync's inner SessionCloseStartedException
|
||||
// catch normally removes the session and accounts the close (Server-046). The
|
||||
// outer fallback only fires for sessions still in the registry — route through
|
||||
// KillWorkerAsync so the bookkeeping is identical to the dashboard kill path.
|
||||
if (_registry.TryGet(session.SessionId, out _))
|
||||
{
|
||||
session.KillWorker(GatewayShutdownReason);
|
||||
await RemoveSessionAsync(session).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
await KillWorkerAsync(session.SessionId, GatewayShutdownReason, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (SessionManagerException killException)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
killException,
|
||||
"Worker kill fallback failed for session {SessionId}.",
|
||||
session.SessionId);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -332,7 +356,12 @@ public sealed class SessionManager : ISessionManager
|
||||
session.MarkFaulted(exception.Message);
|
||||
if (!wasClosed)
|
||||
{
|
||||
_metrics.SessionRemoved();
|
||||
// Server-046: account the close as a SessionClosed (decrements the open-session
|
||||
// gauge AND increments the sessions.closed counter), not just SessionRemoved.
|
||||
// The session is being removed from the registry below; treating this as a
|
||||
// half-finished close that only decremented the gauge under-counted the closed
|
||||
// counter.
|
||||
_metrics.SessionClosed();
|
||||
}
|
||||
|
||||
_metrics.Fault(SessionManagerErrorCode.CloseFailed.ToString());
|
||||
|
||||
Reference in New Issue
Block a user