Resolve Server-002, -004, -005, -006 code-review findings

Server-002: the gateway never terminated leftover MxGateway.Worker.exe
processes at startup, contradicting gateway.md and CLAUDE.md. Added
IRunningProcessInspector/SystemRunningProcessInspector, OrphanWorkerTerminator,
and OrphanWorkerCleanupHostedService (best-effort, runs before sessions are
accepted); updated gateway.md to describe the implemented behavior.

Server-004: API-key scopes were persisted verbatim with no validation. Added
GatewayScopes.All/IsKnown; the CLI parser and dashboard create path now
reject unknown scope strings.

Server-005: an exception other than SqlException or InvalidOperationException
thrown during the initial Galaxy hierarchy load faulted the BackgroundService.
ExecuteAsync now catches all non-cancellation exceptions on first load, and
RefreshCoreAsync broadens its catch so the cache records Stale/Unavailable
instead.

Server-006: OpenSessionAsync incremented the open-sessions gauge before
alarm auto-subscribe; an auto-subscribe failure leaked the gauge. The catch
path now calls SessionRemoved() when the gauge was incremented.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-18 21:31:10 -04:00
parent 5e795aeeb8
commit 1d9e3afadd
18 changed files with 676 additions and 15 deletions
@@ -1,6 +1,7 @@
using System.Security.Claims;
using Microsoft.Data.Sqlite;
using MxGateway.Server.Security.Authentication;
using MxGateway.Server.Security.Authorization;
namespace MxGateway.Server.Dashboard;
@@ -171,6 +172,15 @@ public sealed class DashboardApiKeyManagementService(
return "Display name is required.";
}
string[] unknownScopes = request.Scopes
.Where(scope => !GatewayScopes.IsKnown(scope))
.ToArray();
if (unknownScopes.Length > 0)
{
return $"Unknown scope(s): {string.Join(", ", unknownScopes)}. "
+ $"Valid scopes are: {string.Join(", ", GatewayScopes.All)}.";
}
return null;
}
@@ -1,5 +1,4 @@
using Google.Protobuf.WellKnownTypes;
using Microsoft.Data.SqlClient;
using Microsoft.Extensions.Logging;
using MxGateway.Contracts.Proto.Galaxy;
using MxGateway.Server.Dashboard;
@@ -181,8 +180,13 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache
{
throw;
}
catch (Exception exception) when (exception is SqlException or InvalidOperationException)
catch (Exception exception)
{
// Catch every non-cancellation failure — not just SqlException /
// InvalidOperationException. A TimeoutException or Win32Exception
// from connection establishment, or another DbException subtype,
// must still degrade gracefully to Stale/Unavailable and complete
// _firstLoad rather than escape and fault the refresh BackgroundService.
_logger?.LogWarning(exception, "Galaxy hierarchy cache refresh failed.");
GalaxyHierarchyCacheEntry failed = previous with
{
@@ -26,6 +26,15 @@ public sealed class GalaxyHierarchyRefreshService(
{
return;
}
catch (Exception exception)
{
// A transient first-load failure (e.g. a TimeoutException or
// Win32Exception from connection establishment, or a DbException
// subtype the cache does not catch) must not fault this
// BackgroundService and stop the whole gateway. The cache records
// its own Unavailable/Stale status; the periodic tick below retries.
logger.LogWarning(exception, "Initial Galaxy hierarchy cache load failed; will retry on the refresh interval.");
}
using PeriodicTimer timer = new(interval, _timeProvider);
try
@@ -1,3 +1,5 @@
using MxGateway.Server.Security.Authorization;
namespace MxGateway.Server.Security.Authentication;
public static class ApiKeyAdminCommandLineParser
@@ -95,6 +97,12 @@ public static class ApiKeyAdminCommandLineParser
return ApiKeyAdminParseResult.Fail(validationError);
}
string? scopeError = ValidateScopes(kind, scopes);
if (scopeError is not null)
{
return ApiKeyAdminParseResult.Fail(scopeError);
}
return ApiKeyAdminParseResult.Success(new ApiKeyAdminCommand(
Kind: kind,
Json: json,
@@ -152,6 +160,23 @@ public static class ApiKeyAdminCommandLineParser
return null;
}
/// <summary>
/// Checks the requested scopes of a key-creation command against the canonical
/// scope catalog. Commands of any other kind are not scope-validated.
/// </summary>
/// <param name="kind">Kind of admin command being parsed.</param>
/// <param name="scopes">Scope strings supplied on the command line.</param>
/// <returns>
/// An error message listing the unrecognized scopes and the valid ones, or
/// <see langword="null"/> when there is nothing to report.
/// </returns>
private static string? ValidateScopes(ApiKeyAdminCommandKind kind, IReadOnlySet<string> scopes)
{
    if (kind == ApiKeyAdminCommandKind.CreateKey)
    {
        List<string> rejected = [];
        foreach (string candidate in scopes)
        {
            if (!GatewayScopes.IsKnown(candidate))
            {
                rejected.Add(candidate);
            }
        }

        if (rejected.Count > 0)
        {
            string rejectedList = string.Join(", ", rejected);
            string validList = string.Join(", ", GatewayScopes.All);
            return $"Unknown scope(s): {rejectedList}. "
                + $"Valid scopes are: {validList}.";
        }
    }

    return null;
}
private static string KindName(ApiKeyAdminCommandKind kind)
{
return kind switch
@@ -10,4 +10,28 @@ public static class GatewayScopes
public const string EventsRead = "events:read";
public const string MetadataRead = "metadata:read";
public const string Admin = "admin";
/// <summary>
/// Every canonical scope string known to the gateway, compared ordinally
/// (case-sensitive). The CLI and dashboard key-creation paths check requested
/// scopes against this set, so a misspelled or non-canonical scope is rejected
/// instead of being stored on a key.
/// </summary>
public static readonly IReadOnlySet<string> All = new HashSet<string>(System.StringComparer.Ordinal)
{
    SessionOpen,
    SessionClose,
    InvokeRead,
    InvokeWrite,
    InvokeSecure,
    EventsRead,
    MetadataRead,
    Admin,
};
/// <summary>Reports whether a scope string belongs to the canonical scope catalog (<see cref="All"/>).</summary>
/// <param name="scope">Scope string to look up; matched ordinally (case-sensitive).</param>
/// <returns><see langword="true"/> if the scope is in the catalog; <see langword="false"/> if it is not.</returns>
public static bool IsKnown(string scope)
{
    return All.Contains(scope);
}
}
@@ -68,6 +68,7 @@ public sealed class SessionManager : ISessionManager
EnsureSessionCapacity();
GatewaySession? session = null;
bool sessionOpenedRecorded = false;
try
{
session = CreateSession(request, clientIdentity);
@@ -86,6 +87,7 @@ public sealed class SessionManager : ISessionManager
session.AttachWorkerClient(workerClient);
session.MarkReady();
_metrics.SessionOpened();
sessionOpenedRecorded = true;
await TryAutoSubscribeAlarmsAsync(session, cancellationToken).ConfigureAwait(false);
@@ -100,6 +102,14 @@ public sealed class SessionManager : ISessionManager
await session.DisposeAsync().ConfigureAwait(false);
}
// If SessionOpened() already incremented the open-session gauge,
// a failure after that point (e.g. auto-subscribe rejection) must
// decrement it again so mxgateway.sessions.open does not leak.
if (sessionOpenedRecorded)
{
_metrics.SessionRemoved();
}
ReleaseSessionSlot();
_metrics.Fault(SessionManagerErrorCode.OpenFailed.ToString());
_logger.LogWarning(
@@ -0,0 +1,29 @@
namespace MxGateway.Server.Workers;
/// <summary>
/// Abstraction over OS process enumeration and termination. Exists so the
/// orphan-worker cleanup logic can be unit-tested without spawning real
/// processes.
/// </summary>
/// <remarks>
/// <see cref="OrphanWorkerTerminator"/> is the consumer: it enumerates
/// candidates by image name, decides which ones are orphans, and then asks
/// this inspector to kill them one at a time.
/// </remarks>
public interface IRunningProcessInspector
{
/// <summary>
/// Enumerates currently running processes whose image name (without the
/// <c>.exe</c> extension) matches <paramref name="processName"/>.
/// </summary>
/// <param name="processName">Process image name to match, without extension.</param>
/// <returns>
/// The matching running processes. An entry's
/// <see cref="RunningProcessInfo.ExecutablePath"/> is <see langword="null"/>
/// when the executable path could not be read.
/// </returns>
IReadOnlyList<RunningProcessInfo> GetProcessesByName(string processName);
/// <summary>Forcibly terminates the process with the given identifier.</summary>
/// <param name="processId">Identifier of the process to terminate.</param>
/// <remarks>
/// May throw when the process cannot be terminated (for example it has
/// already exited or is inaccessible); <see cref="OrphanWorkerTerminator"/>
/// catches and logs such failures per process.
/// </remarks>
void Kill(int processId);
}
/// <summary>
/// Identifying information for a running process candidate, as returned by
/// <see cref="IRunningProcessInspector.GetProcessesByName(string)"/>.
/// </summary>
/// <param name="ProcessId">Operating-system process identifier.</param>
/// <param name="ExecutablePath">
/// Fully-qualified path to the process main module, or <see langword="null"/>
/// when it could not be read (e.g. access denied). Orphan classification in
/// <see cref="OrphanWorkerTerminator"/> falls back to the image-name match
/// alone when this is <see langword="null"/>.
/// </param>
public sealed record RunningProcessInfo(int ProcessId, string? ExecutablePath);
@@ -0,0 +1,30 @@
namespace MxGateway.Server.Workers;
/// <summary>
/// Hosted service whose only job is a one-shot sweep for leftover MXAccess
/// worker processes when the gateway starts. The sweep runs synchronously
/// inside <see cref="StartAsync"/>; shutdown is a no-op.
/// </summary>
public sealed class OrphanWorkerCleanupHostedService(
    OrphanWorkerTerminator terminator,
    ILogger<OrphanWorkerCleanupHostedService> logger) : IHostedService
{
    /// <inheritdoc />
    public Task StartAsync(CancellationToken cancellationToken)
    {
        SweepBestEffort();
        return Task.CompletedTask;
    }

    /// <inheritdoc />
    public Task StopAsync(CancellationToken cancellationToken)
    {
        return Task.CompletedTask;
    }

    /// <summary>
    /// Runs the orphan sweep and downgrades any failure to a warning, because
    /// cleanup is best-effort and must never stop the gateway from starting.
    /// </summary>
    private void SweepBestEffort()
    {
        try
        {
            _ = terminator.TerminateOrphans();
        }
        catch (Exception failure)
        {
            logger.LogWarning(failure, "Orphan worker cleanup failed on startup.");
        }
    }
}
@@ -0,0 +1,138 @@
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using MxGateway.Server.Configuration;
using MxGateway.Server.Metrics;
namespace MxGateway.Server.Workers;
/// <summary>
/// Terminates leftover MXAccess worker processes on gateway startup.
/// <para>
/// Per <c>gateway.md</c> ("first version should terminate orphaned workers
/// on startup") and CLAUDE.md, a gateway restart does not reattach old
/// workers. After an unclean gateway crash, x86 worker processes — each
/// holding an MXAccess COM instance on an STA — survive indefinitely. This
/// terminator finds those processes by executable name/path and kills them
/// before the restarted gateway accepts sessions.
/// </para>
/// </summary>
public sealed class OrphanWorkerTerminator
{
    /// <summary>
    /// Worker image used when the configured executable path is unusable or
    /// does not yield a file name.
    /// </summary>
    private const string DefaultWorkerImageName = "MxGateway.Worker.exe";

    private readonly IRunningProcessInspector _inspector;
    private readonly GatewayMetrics _metrics;
    private readonly WorkerOptions _workerOptions;
    private readonly ILogger<OrphanWorkerTerminator> _logger;

    /// <summary>Initializes a new instance of the <see cref="OrphanWorkerTerminator"/> class.</summary>
    /// <param name="gatewayOptions">Gateway configuration options; the worker section supplies the executable path.</param>
    /// <param name="inspector">Running-process inspector used to enumerate and kill processes.</param>
    /// <param name="metrics">Gateway metrics collector; records each killed worker.</param>
    /// <param name="logger">Optional logger for diagnostic output; a no-op logger is used when omitted.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="gatewayOptions"/>, <paramref name="inspector"/> or
    /// <paramref name="metrics"/> is <see langword="null"/>.
    /// </exception>
    public OrphanWorkerTerminator(
        IOptions<GatewayOptions> gatewayOptions,
        IRunningProcessInspector inspector,
        GatewayMetrics metrics,
        ILogger<OrphanWorkerTerminator>? logger = null)
    {
        ArgumentNullException.ThrowIfNull(gatewayOptions);
        _inspector = inspector ?? throw new ArgumentNullException(nameof(inspector));
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _workerOptions = gatewayOptions.Value.Worker;
        _logger = logger ?? NullLogger<OrphanWorkerTerminator>.Instance;
    }

    /// <summary>
    /// Finds and kills every leftover worker process. Safe to call once at
    /// startup before any session-owned worker is launched.
    /// </summary>
    /// <remarks>
    /// A failure to kill an individual process is logged and skipped so that
    /// one stubborn orphan cannot block the rest of the sweep.
    /// </remarks>
    /// <returns>The number of orphan worker processes that were terminated.</returns>
    public int TerminateOrphans()
    {
        string? configuredPath = ResolveConfiguredExecutablePath();
        string processName = ResolveProcessName(configuredPath);
        int currentProcessId = Environment.ProcessId;
        int terminated = 0;

        foreach (RunningProcessInfo candidate in _inspector.GetProcessesByName(processName))
        {
            // Never kill the gateway itself, even if it shares the image name.
            if (candidate.ProcessId == currentProcessId)
            {
                continue;
            }

            if (!IsOrphanWorker(candidate, configuredPath))
            {
                continue;
            }

            try
            {
                _inspector.Kill(candidate.ProcessId);
                _metrics.WorkerKilled("OrphanStartupCleanup");
                terminated++;
                _logger.LogWarning(
                    "Terminated orphan worker process {ProcessId} ({ExecutablePath}) left over from a previous gateway run.",
                    candidate.ProcessId,
                    candidate.ExecutablePath ?? processName);
            }
            catch (Exception exception)
            {
                // The process may have already exited, or be inaccessible.
                // A failure to kill one orphan must not block gateway startup.
                _logger.LogWarning(
                    exception,
                    "Failed to terminate orphan worker process {ProcessId}.",
                    candidate.ProcessId);
            }
        }

        if (terminated > 0)
        {
            _logger.LogInformation("Terminated {Count} orphan worker process(es) on startup.", terminated);
        }

        return terminated;
    }

    /// <summary>Decides whether an image-name match is a leftover worker that should be killed.</summary>
    /// <param name="candidate">Process that matched the worker image name.</param>
    /// <param name="configuredPath">Full configured worker path, or <see langword="null"/> when it could not be resolved.</param>
    /// <returns><see langword="true"/> when the candidate should be terminated.</returns>
    private static bool IsOrphanWorker(RunningProcessInfo candidate, string? configuredPath)
    {
        // When the executable path is readable, require an exact match against
        // the configured worker path so unrelated processes that merely share
        // the image name are never killed. If the configured path could not be
        // resolved, a candidate with a readable path is therefore never killed.
        if (candidate.ExecutablePath is { } path)
        {
            return configuredPath is not null
                && string.Equals(path, configuredPath, StringComparison.OrdinalIgnoreCase);
        }

        // A null path means the candidate's main module could not be read.
        // Image-name match is the only signal available; treat it as an orphan.
        return true;
    }

    /// <summary>Resolves the configured worker executable to a full path.</summary>
    /// <returns>
    /// The full path, or <see langword="null"/> (after logging a warning) when
    /// the configured value is not a valid filesystem path.
    /// </returns>
    private string? ResolveConfiguredExecutablePath()
    {
        try
        {
            return Path.GetFullPath(_workerOptions.ExecutablePath);
        }
        catch (Exception exception) when (exception is ArgumentException
            or NotSupportedException
            or PathTooLongException)
        {
            _logger.LogWarning(
                exception,
                "Configured worker executable path '{ExecutablePath}' is not a valid filesystem path; "
                + "orphan cleanup will match by image name only.",
                _workerOptions.ExecutablePath);
            return null;
        }
    }

    /// <summary>Derives the process image name (no extension) used to enumerate candidates.</summary>
    /// <param name="configuredPath">Full configured worker path, or <see langword="null"/> to use the default image.</param>
    /// <returns>A non-empty process name.</returns>
    private static string ResolveProcessName(string? configuredPath)
    {
        string name = Path.GetFileNameWithoutExtension(configuredPath ?? DefaultWorkerImageName);

        // A configured path with no file-name stem (e.g. one ending in a
        // directory separator, or ".exe") yields an empty name. An empty name
        // cannot identify a worker image and must never be used as the
        // enumeration filter, so fall back to the default worker image, the
        // same way an unresolvable path is handled.
        return string.IsNullOrWhiteSpace(name)
            ? Path.GetFileNameWithoutExtension(DefaultWorkerImageName)
            : name;
    }
}
@@ -0,0 +1,55 @@
using System.Diagnostics;
namespace MxGateway.Server.Workers;
/// <summary>
/// Production <see cref="IRunningProcessInspector"/> that delegates to
/// <see cref="Process"/> for enumeration and termination.
/// </summary>
public sealed class SystemRunningProcessInspector : IRunningProcessInspector
{
    /// <inheritdoc />
    public IReadOnlyList<RunningProcessInfo> GetProcessesByName(string processName)
    {
        List<RunningProcessInfo> snapshot = [];
        Process[] matches = Process.GetProcessesByName(processName);

        try
        {
            for (int index = 0; index < matches.Length; index++)
            {
                Process match = matches[index];
                int id = match.Id;
                string? modulePath = ReadMainModulePath(match);
                snapshot.Add(new RunningProcessInfo(id, modulePath));
            }
        }
        finally
        {
            // Release every handle obtained from the enumeration, even when
            // building the snapshot failed part-way through.
            Array.ForEach(matches, static match => match.Dispose());
        }

        return snapshot;
    }

    /// <inheritdoc />
    public void Kill(int processId)
    {
        using (Process target = Process.GetProcessById(processId))
        {
            // Take down the process together with any children it spawned.
            target.Kill(entireProcessTree: true);
        }
    }

    /// <summary>
    /// Reads the file name of a process's main module, returning
    /// <see langword="null"/> when the module cannot be inspected.
    /// </summary>
    /// <param name="process">Process whose main module path is wanted.</param>
    /// <returns>The main module file name, or <see langword="null"/>.</returns>
    private static string? ReadMainModulePath(Process process)
    {
        // Access to the main module can be denied (e.g. a 64-bit gateway
        // querying a 32-bit worker, or a process owned by another user);
        // each of those failures is reported as "path unknown".
        try
        {
            ProcessModule? mainModule = process.MainModule;
            return mainModule?.FileName;
        }
        catch (InvalidOperationException)
        {
            return null;
        }
        catch (System.ComponentModel.Win32Exception)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }
}
@@ -11,6 +11,13 @@ public static class WorkerServiceCollectionExtensions
services.AddSingleton<IWorkerStartupProbe, WorkerProcessStartedProbe>();
services.AddSingleton<IWorkerProcessLauncher, WorkerProcessLauncher>();
// Terminate workers leaked by a previous unclean gateway run before the
// server accepts sessions. Registered ahead of AddGatewaySessions so the
// cleanup hosted service starts before the session subsystem.
services.AddSingleton<IRunningProcessInspector, SystemRunningProcessInspector>();
services.AddSingleton<OrphanWorkerTerminator>();
services.AddHostedService<OrphanWorkerCleanupHostedService>();
return services;
}
}