Resolve Server-002, -004, -005, -006 code-review findings
Server-002: the gateway never terminated leftover MxGateway.Worker.exe processes at startup, contradicting gateway.md and CLAUDE.md. Added IRunningProcessInspector/SystemRunningProcessInspector, OrphanWorkerTerminator, and OrphanWorkerCleanupHostedService (best-effort, runs before sessions are accepted); updated gateway.md to describe the implemented behavior. Server-004: API-key scopes were persisted verbatim with no validation. Added GatewayScopes.All/IsKnown; the CLI parser and dashboard create path now reject unknown scope strings. Server-005: a non-SqlException/InvalidOperationException fault on the initial Galaxy hierarchy load faulted the BackgroundService. ExecuteAsync now catches all non-cancellation exceptions on first load and RefreshCoreAsync broadens its catch so the cache records Stale/Unavailable instead. Server-006: OpenSessionAsync incremented the open-sessions gauge before alarm auto-subscribe; an auto-subscribe failure leaked the gauge. The catch path now calls SessionRemoved() when the gauge was incremented. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
using System.Security.Claims;
|
||||
using Microsoft.Data.Sqlite;
|
||||
using MxGateway.Server.Security.Authentication;
|
||||
using MxGateway.Server.Security.Authorization;
|
||||
|
||||
namespace MxGateway.Server.Dashboard;
|
||||
|
||||
@@ -171,6 +172,15 @@ public sealed class DashboardApiKeyManagementService(
|
||||
return "Display name is required.";
|
||||
}
|
||||
|
||||
string[] unknownScopes = request.Scopes
|
||||
.Where(scope => !GatewayScopes.IsKnown(scope))
|
||||
.ToArray();
|
||||
if (unknownScopes.Length > 0)
|
||||
{
|
||||
return $"Unknown scope(s): {string.Join(", ", unknownScopes)}. "
|
||||
+ $"Valid scopes are: {string.Join(", ", GatewayScopes.All)}.";
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
using Google.Protobuf.WellKnownTypes;
|
||||
using Microsoft.Data.SqlClient;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using MxGateway.Contracts.Proto.Galaxy;
|
||||
using MxGateway.Server.Dashboard;
|
||||
@@ -181,8 +180,13 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache
|
||||
{
|
||||
throw;
|
||||
}
|
||||
catch (Exception exception) when (exception is SqlException or InvalidOperationException)
|
||||
catch (Exception exception)
|
||||
{
|
||||
// Catch every non-cancellation failure — not just SqlException /
|
||||
// InvalidOperationException. A TimeoutException or Win32Exception
|
||||
// from connection establishment, or another DbException subtype,
|
||||
// must still degrade gracefully to Stale/Unavailable and complete
|
||||
// _firstLoad rather than escape and fault the refresh BackgroundService.
|
||||
_logger?.LogWarning(exception, "Galaxy hierarchy cache refresh failed.");
|
||||
GalaxyHierarchyCacheEntry failed = previous with
|
||||
{
|
||||
|
||||
@@ -26,6 +26,15 @@ public sealed class GalaxyHierarchyRefreshService(
|
||||
{
|
||||
return;
|
||||
}
|
||||
catch (Exception exception)
|
||||
{
|
||||
// A transient first-load failure (e.g. a TimeoutException or
|
||||
// Win32Exception from connection establishment, or a DbException
|
||||
// subtype the cache does not catch) must not fault this
|
||||
// BackgroundService and stop the whole gateway. The cache records
|
||||
// its own Unavailable/Stale status; the periodic tick below retries.
|
||||
logger.LogWarning(exception, "Initial Galaxy hierarchy cache load failed; will retry on the refresh interval.");
|
||||
}
|
||||
|
||||
using PeriodicTimer timer = new(interval, _timeProvider);
|
||||
try
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
using MxGateway.Server.Security.Authorization;
|
||||
|
||||
namespace MxGateway.Server.Security.Authentication;
|
||||
|
||||
public static class ApiKeyAdminCommandLineParser
|
||||
@@ -95,6 +97,12 @@ public static class ApiKeyAdminCommandLineParser
|
||||
return ApiKeyAdminParseResult.Fail(validationError);
|
||||
}
|
||||
|
||||
string? scopeError = ValidateScopes(kind, scopes);
|
||||
if (scopeError is not null)
|
||||
{
|
||||
return ApiKeyAdminParseResult.Fail(scopeError);
|
||||
}
|
||||
|
||||
return ApiKeyAdminParseResult.Success(new ApiKeyAdminCommand(
|
||||
Kind: kind,
|
||||
Json: json,
|
||||
@@ -152,6 +160,23 @@ public static class ApiKeyAdminCommandLineParser
|
||||
return null;
|
||||
}
|
||||
|
||||
/// <summary>
/// Checks the scopes requested for a key-creation command against the
/// catalog exposed by <see cref="GatewayScopes"/>.
/// </summary>
/// <param name="kind">Command being parsed; only <c>CreateKey</c> is validated.</param>
/// <param name="scopes">Scope strings supplied on the command line.</param>
/// <returns>
/// An error message naming every unrecognized scope, or <see langword="null"/>
/// when the command is not a key creation or all scopes are recognized.
/// </returns>
private static string? ValidateScopes(ApiKeyAdminCommandKind kind, IReadOnlySet<string> scopes)
{
    if (kind == ApiKeyAdminCommandKind.CreateKey)
    {
        // Collect every offending scope so the operator sees them all at once.
        List<string> rejected = new();
        foreach (string candidate in scopes)
        {
            if (!GatewayScopes.IsKnown(candidate))
            {
                rejected.Add(candidate);
            }
        }

        if (rejected.Count > 0)
        {
            string offending = string.Join(", ", rejected);
            string allowed = string.Join(", ", GatewayScopes.All);
            return $"Unknown scope(s): {offending}. "
                + $"Valid scopes are: {allowed}.";
        }
    }

    return null;
}
|
||||
|
||||
private static string KindName(ApiKeyAdminCommandKind kind)
|
||||
{
|
||||
return kind switch
|
||||
|
||||
@@ -10,4 +10,28 @@ public static class GatewayScopes
|
||||
public const string EventsRead = "events:read";
|
||||
public const string MetadataRead = "metadata:read";
|
||||
public const string Admin = "admin";
|
||||
|
||||
/// <summary>
/// Set of the canonical scope strings listed below, compared ordinally
/// (case-sensitive). <see cref="IsKnown"/> tests membership in this set, so
/// key-creation paths can reject a typo or non-canonical scope name before
/// it is persisted.
/// </summary>
public static readonly IReadOnlySet<string> All = new HashSet<string>(System.StringComparer.Ordinal)
{
    SessionOpen,
    SessionClose,
    InvokeRead,
    InvokeWrite,
    InvokeSecure,
    EventsRead,
    MetadataRead,
    Admin,
};
|
||||
|
||||
/// <summary>Reports whether a scope string is a member of <see cref="All"/>.</summary>
/// <param name="scope">Scope string to look up; the comparison is ordinal and case-sensitive.</param>
/// <returns><see langword="true"/> when <paramref name="scope"/> is in <see cref="All"/>; otherwise <see langword="false"/>.</returns>
public static bool IsKnown(string scope)
{
    return All.Contains(scope);
}
|
||||
}
|
||||
|
||||
@@ -68,6 +68,7 @@ public sealed class SessionManager : ISessionManager
|
||||
EnsureSessionCapacity();
|
||||
|
||||
GatewaySession? session = null;
|
||||
bool sessionOpenedRecorded = false;
|
||||
try
|
||||
{
|
||||
session = CreateSession(request, clientIdentity);
|
||||
@@ -86,6 +87,7 @@ public sealed class SessionManager : ISessionManager
|
||||
session.AttachWorkerClient(workerClient);
|
||||
session.MarkReady();
|
||||
_metrics.SessionOpened();
|
||||
sessionOpenedRecorded = true;
|
||||
|
||||
await TryAutoSubscribeAlarmsAsync(session, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
@@ -100,6 +102,14 @@ public sealed class SessionManager : ISessionManager
|
||||
await session.DisposeAsync().ConfigureAwait(false);
|
||||
}
|
||||
|
||||
// If SessionOpened() already incremented the open-session gauge,
|
||||
// a failure after that point (e.g. auto-subscribe rejection) must
|
||||
// decrement it again so mxgateway.sessions.open does not leak.
|
||||
if (sessionOpenedRecorded)
|
||||
{
|
||||
_metrics.SessionRemoved();
|
||||
}
|
||||
|
||||
ReleaseSessionSlot();
|
||||
_metrics.Fault(SessionManagerErrorCode.OpenFailed.ToString());
|
||||
_logger.LogWarning(
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
namespace MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Seam over operating-system process lookup and termination, so that the
/// orphan-worker cleanup logic can be exercised in unit tests with a fake
/// instead of real processes.
/// </summary>
public interface IRunningProcessInspector
{
    /// <summary>
    /// Lists the currently running processes whose image name, without the
    /// <c>.exe</c> extension, matches <paramref name="processName"/>.
    /// </summary>
    /// <param name="processName">Image name to look for, extension omitted.</param>
    /// <returns>One entry per matching running process.</returns>
    IReadOnlyList<RunningProcessInfo> GetProcessesByName(string processName);

    /// <summary>Forcibly ends the process identified by <paramref name="processId"/>.</summary>
    /// <param name="processId">Operating-system identifier of the process to end.</param>
    void Kill(int processId);
}
|
||||
|
||||
/// <summary>Describes one running process returned by an <see cref="IRunningProcessInspector"/>.</summary>
/// <param name="ProcessId">Operating-system identifier of the process.</param>
/// <param name="ExecutablePath">
/// Full path of the process main module, or <see langword="null"/> when the
/// path could not be read (for example because access was denied).
/// </param>
public sealed record RunningProcessInfo(
    int ProcessId,
    string? ExecutablePath);
|
||||
@@ -0,0 +1,30 @@
|
||||
namespace MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Hosted service that asks the <see cref="OrphanWorkerTerminator"/> to kill
/// leftover MXAccess worker processes once, when the host starts it. It is
/// intended to run before the gateway begins accepting sessions.
/// </summary>
public sealed class OrphanWorkerCleanupHostedService(
    OrphanWorkerTerminator terminator,
    ILogger<OrphanWorkerCleanupHostedService> logger) : IHostedService
{
    /// <inheritdoc />
    public Task StartAsync(CancellationToken cancellationToken)
    {
        RunCleanupBestEffort();
        return Task.CompletedTask;
    }

    /// <inheritdoc />
    public Task StopAsync(CancellationToken cancellationToken)
    {
        // Nothing to tear down: the cleanup is a one-shot action at startup.
        return Task.CompletedTask;
    }

    /// <summary>
    /// Runs the orphan cleanup and logs, rather than propagates, any failure
    /// so that a cleanup problem never prevents the gateway from starting.
    /// </summary>
    private void RunCleanupBestEffort()
    {
        try
        {
            _ = terminator.TerminateOrphans();
        }
        catch (Exception failure)
        {
            logger.LogWarning(failure, "Orphan worker cleanup failed on startup.");
        }
    }
}
|
||||
@@ -0,0 +1,138 @@
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Options;
|
||||
using MxGateway.Server.Configuration;
|
||||
using MxGateway.Server.Metrics;
|
||||
|
||||
namespace MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Terminates leftover MXAccess worker processes on gateway startup.
/// <para>
/// Per <c>gateway.md</c> ("first version should terminate orphaned workers
/// on startup") and CLAUDE.md, a gateway restart does not reattach old
/// workers. After an unclean gateway crash, x86 worker processes — each
/// holding an MXAccess COM instance on an STA — survive indefinitely. This
/// terminator finds those processes by executable name/path and kills them
/// before the restarted gateway accepts sessions.
/// </para>
/// <para>
/// Candidates are looked up by the image name derived from the configured
/// worker executable path. A candidate whose module path is readable is
/// killed only when that path equals the configured path; a candidate whose
/// module path is unreadable is killed on the image-name match alone.
/// </para>
/// </summary>
public sealed class OrphanWorkerTerminator
{
    private readonly IRunningProcessInspector _inspector;
    private readonly GatewayMetrics _metrics;
    private readonly WorkerOptions _workerOptions;
    private readonly ILogger<OrphanWorkerTerminator> _logger;

    /// <summary>Initializes a new instance of the <see cref="OrphanWorkerTerminator"/> class.</summary>
    /// <param name="gatewayOptions">Gateway configuration options; the worker section supplies the executable path.</param>
    /// <param name="inspector">Running-process inspector used to enumerate and kill candidates.</param>
    /// <param name="metrics">Gateway metrics collector notified of every successful kill.</param>
    /// <param name="logger">Optional logger for diagnostic output; a no-op logger is used when omitted.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="gatewayOptions"/>, <paramref name="inspector"/> or <paramref name="metrics"/> is <see langword="null"/>.
    /// </exception>
    public OrphanWorkerTerminator(
        IOptions<GatewayOptions> gatewayOptions,
        IRunningProcessInspector inspector,
        GatewayMetrics metrics,
        ILogger<OrphanWorkerTerminator>? logger = null)
    {
        ArgumentNullException.ThrowIfNull(gatewayOptions);
        _inspector = inspector ?? throw new ArgumentNullException(nameof(inspector));
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _workerOptions = gatewayOptions.Value.Worker;
        _logger = logger ?? NullLogger<OrphanWorkerTerminator>.Instance;
    }

    /// <summary>
    /// Finds and kills every leftover worker process. Safe to call once at
    /// startup before any session-owned worker is launched.
    /// </summary>
    /// <returns>
    /// The number of orphan worker processes that were terminated. Returns 0
    /// without enumerating any process when no usable image name can be
    /// derived from the configured executable path.
    /// </returns>
    public int TerminateOrphans()
    {
        string? configuredPath = ResolveConfiguredExecutablePath();
        string processName = ResolveProcessName(configuredPath);

        // A configured path with no file-name component (for example one that
        // ends in a directory separator) yields an empty image name. An empty
        // name cannot identify the worker, and because candidates with an
        // unreadable module path are killed on the name match alone, whatever
        // an inspector returned for it could be terminated. Skip the cleanup.
        if (string.IsNullOrWhiteSpace(processName))
        {
            _logger.LogWarning(
                "Configured worker executable path '{ExecutablePath}' does not name an executable file; "
                + "skipping orphan worker cleanup.",
                _workerOptions.ExecutablePath);
            return 0;
        }

        int currentProcessId = Environment.ProcessId;

        int terminated = 0;
        foreach (RunningProcessInfo candidate in _inspector.GetProcessesByName(processName))
        {
            // Never kill the gateway itself, even if it shares the image name.
            if (candidate.ProcessId == currentProcessId)
            {
                continue;
            }

            if (!IsOrphanWorker(candidate, configuredPath))
            {
                continue;
            }

            try
            {
                _inspector.Kill(candidate.ProcessId);
                _metrics.WorkerKilled("OrphanStartupCleanup");
                terminated++;
                _logger.LogWarning(
                    "Terminated orphan worker process {ProcessId} ({ExecutablePath}) left over from a previous gateway run.",
                    candidate.ProcessId,
                    candidate.ExecutablePath ?? processName);
            }
            catch (Exception exception)
            {
                // The process may have already exited, or be inaccessible.
                // A failure to kill one orphan must not block gateway startup.
                _logger.LogWarning(
                    exception,
                    "Failed to terminate orphan worker process {ProcessId}.",
                    candidate.ProcessId);
            }
        }

        if (terminated > 0)
        {
            _logger.LogInformation("Terminated {Count} orphan worker process(es) on startup.", terminated);
        }

        return terminated;
    }

    /// <summary>Decides whether a name-matched candidate should be treated as an orphan worker.</summary>
    /// <param name="candidate">Process that matched the worker image name.</param>
    /// <param name="configuredPath">Full configured worker path, or <see langword="null"/> when it could not be resolved.</param>
    /// <returns><see langword="true"/> when the candidate should be terminated.</returns>
    private static bool IsOrphanWorker(RunningProcessInfo candidate, string? configuredPath)
    {
        // When the executable path is readable, require an exact match against
        // the configured worker path so unrelated processes that merely share
        // the image name are never killed. With no configured path to compare
        // against, a candidate with a readable path is therefore never killed.
        if (candidate.ExecutablePath is { } path)
        {
            return configuredPath is not null
                && string.Equals(path, configuredPath, StringComparison.OrdinalIgnoreCase);
        }

        // A null path means the x64 gateway could not introspect the module —
        // the expected case for the x86 worker. Image-name match is the only
        // signal available; treat it as an orphan.
        return true;
    }

    /// <summary>Resolves the configured worker executable path to a full path.</summary>
    /// <returns>
    /// The full path, or <see langword="null"/> (after logging a warning) when
    /// the configured value is not a valid filesystem path.
    /// </returns>
    private string? ResolveConfiguredExecutablePath()
    {
        try
        {
            return Path.GetFullPath(_workerOptions.ExecutablePath);
        }
        catch (Exception exception) when (exception is ArgumentException
            or NotSupportedException
            or PathTooLongException)
        {
            _logger.LogWarning(
                exception,
                "Configured worker executable path '{ExecutablePath}' is not a valid filesystem path; "
                + "orphan cleanup will match by image name only.",
                _workerOptions.ExecutablePath);
            return null;
        }
    }

    /// <summary>
    /// Derives the extension-less image name to search for, falling back to
    /// the default worker executable name when no configured path is available.
    /// </summary>
    /// <param name="configuredPath">Full configured worker path, or <see langword="null"/>.</param>
    /// <returns>The image name without extension; empty when the path has no file-name component.</returns>
    private static string ResolveProcessName(string? configuredPath)
    {
        string source = configuredPath ?? "MxGateway.Worker.exe";
        return Path.GetFileNameWithoutExtension(source);
    }
}
|
||||
@@ -0,0 +1,55 @@
|
||||
using System.Diagnostics;
|
||||
|
||||
namespace MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Production <see cref="IRunningProcessInspector"/> that looks up and
/// terminates processes through <see cref="Process"/>.
/// </summary>
public sealed class SystemRunningProcessInspector : IRunningProcessInspector
{
    /// <inheritdoc />
    public IReadOnlyList<RunningProcessInfo> GetProcessesByName(string processName)
    {
        Process[] matches = Process.GetProcessesByName(processName);
        List<RunningProcessInfo> snapshot = new();

        try
        {
            for (int index = 0; index < matches.Length; index++)
            {
                Process match = matches[index];
                int processId = match.Id;
                string? modulePath = ReadMainModulePath(match);
                snapshot.Add(new RunningProcessInfo(processId, modulePath));
            }
        }
        finally
        {
            // Release every Process handle, even if building the snapshot failed part-way.
            Array.ForEach(matches, static match => match.Dispose());
        }

        return snapshot;
    }

    /// <inheritdoc />
    public void Kill(int processId)
    {
        Process target = Process.GetProcessById(processId);
        try
        {
            target.Kill(entireProcessTree: true);
        }
        finally
        {
            target.Dispose();
        }
    }

    /// <summary>
    /// Reads the main-module file name of <paramref name="process"/>.
    /// </summary>
    /// <param name="process">Process to inspect.</param>
    /// <returns>
    /// The main-module file name, or <see langword="null"/> when the module is
    /// absent or reading it fails with one of the handled exception types.
    /// </returns>
    private static string? ReadMainModulePath(Process process)
    {
        // Access to the main module can be denied (e.g. a 64-bit gateway
        // querying a 32-bit worker, or a process owned by another user);
        // each handled failure is reported as "path unknown".
        try
        {
            ProcessModule? mainModule = process.MainModule;
            return mainModule?.FileName;
        }
        catch (InvalidOperationException)
        {
            return null;
        }
        catch (System.ComponentModel.Win32Exception)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }
}
|
||||
@@ -11,6 +11,13 @@ public static class WorkerServiceCollectionExtensions
|
||||
services.AddSingleton<IWorkerStartupProbe, WorkerProcessStartedProbe>();
|
||||
services.AddSingleton<IWorkerProcessLauncher, WorkerProcessLauncher>();
|
||||
|
||||
// Terminate workers leaked by a previous unclean gateway run before the
|
||||
// server accepts sessions. Registered ahead of AddGatewaySessions so the
|
||||
// cleanup hosted service starts before the session subsystem.
|
||||
services.AddSingleton<IRunningProcessInspector, SystemRunningProcessInspector>();
|
||||
services.AddSingleton<OrphanWorkerTerminator>();
|
||||
services.AddHostedService<OrphanWorkerCleanupHostedService>();
|
||||
|
||||
return services;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user