Resolve Server-002, -004, -005, -006 code-review findings
Server-002: the gateway never terminated leftover MxGateway.Worker.exe processes at startup, contradicting gateway.md and CLAUDE.md. Added IRunningProcessInspector/SystemRunningProcessInspector, OrphanWorkerTerminator, and OrphanWorkerCleanupHostedService (best-effort, runs before sessions are accepted); updated gateway.md to describe the implemented behavior. Server-004: API-key scopes were persisted verbatim with no validation. Added GatewayScopes.All/IsKnown; the CLI parser and dashboard create path now reject unknown scope strings. Server-005: a non-SqlException/InvalidOperationException fault on the initial Galaxy hierarchy load faulted the BackgroundService. ExecuteAsync now catches all non-cancellation exceptions on first load and RefreshCoreAsync broadens its catch so the cache records Stale/Unavailable instead. Server-006: OpenSessionAsync incremented the open-sessions gauge before alarm auto-subscribe; an auto-subscribe failure leaked the gauge. The catch path now calls SessionRemoved() when the gauge was incremented. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,138 @@
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Options;
|
||||
using MxGateway.Server.Configuration;
|
||||
using MxGateway.Server.Metrics;
|
||||
|
||||
namespace MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Terminates leftover MXAccess worker processes on gateway startup.
/// <para>
/// Per <c>gateway.md</c> ("first version should terminate orphaned workers
/// on startup") and CLAUDE.md, a gateway restart does not reattach old
/// workers. After an unclean gateway crash, x86 worker processes — each
/// holding an MXAccess COM instance on an STA — survive indefinitely. This
/// terminator finds those processes by executable name/path and kills them
/// before the restarted gateway accepts sessions.
/// </para>
/// </summary>
public sealed class OrphanWorkerTerminator
{
    /// <summary>
    /// Worker executable file name used to derive the process image name when
    /// the configured path is invalid or does not contain a file name.
    /// </summary>
    private const string DefaultWorkerExecutableName = "MxGateway.Worker.exe";

    private readonly IRunningProcessInspector _inspector;
    private readonly GatewayMetrics _metrics;
    private readonly WorkerOptions _workerOptions;
    private readonly ILogger<OrphanWorkerTerminator> _logger;

    /// <summary>Initializes a new instance of the <see cref="OrphanWorkerTerminator"/> class.</summary>
    /// <param name="gatewayOptions">Gateway configuration options; the worker section supplies the executable path.</param>
    /// <param name="inspector">Running-process inspector used to enumerate and kill processes.</param>
    /// <param name="metrics">Gateway metrics collector; records each killed worker.</param>
    /// <param name="logger">Optional logger for diagnostic output; a no-op logger is used when omitted.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="gatewayOptions"/>, <paramref name="inspector"/>, or
    /// <paramref name="metrics"/> is <see langword="null"/>.
    /// </exception>
    public OrphanWorkerTerminator(
        IOptions<GatewayOptions> gatewayOptions,
        IRunningProcessInspector inspector,
        GatewayMetrics metrics,
        ILogger<OrphanWorkerTerminator>? logger = null)
    {
        ArgumentNullException.ThrowIfNull(gatewayOptions);
        _inspector = inspector ?? throw new ArgumentNullException(nameof(inspector));
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _workerOptions = gatewayOptions.Value.Worker;
        _logger = logger ?? NullLogger<OrphanWorkerTerminator>.Instance;
    }

    /// <summary>
    /// Finds and kills every leftover worker process. Safe to call once at
    /// startup before any session-owned worker is launched.
    /// </summary>
    /// <remarks>
    /// A failure to kill an individual process is logged and skipped so the
    /// remaining candidates are still processed.
    /// </remarks>
    /// <returns>The number of orphan worker processes that were terminated.</returns>
    public int TerminateOrphans()
    {
        string? configuredPath = ResolveConfiguredExecutablePath();
        string processName = ResolveProcessName(configuredPath);
        int currentProcessId = Environment.ProcessId;

        int terminated = 0;
        foreach (RunningProcessInfo candidate in _inspector.GetProcessesByName(processName))
        {
            // Never kill the gateway itself, even if it shares the image name.
            if (candidate.ProcessId == currentProcessId)
            {
                continue;
            }

            if (!IsOrphanWorker(candidate, configuredPath))
            {
                continue;
            }

            try
            {
                _inspector.Kill(candidate.ProcessId);
                _metrics.WorkerKilled("OrphanStartupCleanup");
                terminated++;
                _logger.LogWarning(
                    "Terminated orphan worker process {ProcessId} ({ExecutablePath}) left over from a previous gateway run.",
                    candidate.ProcessId,
                    candidate.ExecutablePath ?? processName);
            }
            catch (Exception exception)
            {
                // The process may have already exited, or be inaccessible.
                // A failure to kill one orphan must not block gateway startup.
                _logger.LogWarning(
                    exception,
                    "Failed to terminate orphan worker process {ProcessId}.",
                    candidate.ProcessId);
            }
        }

        if (terminated > 0)
        {
            _logger.LogInformation("Terminated {Count} orphan worker process(es) on startup.", terminated);
        }

        return terminated;
    }

    /// <summary>
    /// Decides whether a process that matched by image name should be killed.
    /// </summary>
    /// <param name="candidate">Process returned by the inspector.</param>
    /// <param name="configuredPath">Normalized configured worker path, or <see langword="null"/> when it could not be resolved.</param>
    /// <returns><see langword="true"/> when the candidate is treated as an orphan worker.</returns>
    private static bool IsOrphanWorker(RunningProcessInfo candidate, string? configuredPath)
    {
        // When the executable path is readable, require an exact match against
        // the configured worker path so unrelated processes that merely share
        // the image name are never killed.
        if (candidate.ExecutablePath is { } path)
        {
            return configuredPath is not null
                && string.Equals(path, configuredPath, StringComparison.OrdinalIgnoreCase);
        }

        // A null path means the x64 gateway could not introspect the module —
        // the expected case for the x86 worker. Image-name match is the only
        // signal available; treat it as an orphan.
        return true;
    }

    /// <summary>
    /// Normalizes the configured worker executable path to a full path.
    /// </summary>
    /// <returns>
    /// The full path, or <see langword="null"/> (after logging a warning) when
    /// the configured value is not a valid filesystem path.
    /// </returns>
    private string? ResolveConfiguredExecutablePath()
    {
        try
        {
            return Path.GetFullPath(_workerOptions.ExecutablePath);
        }
        catch (Exception exception) when (exception is ArgumentException
                                              or NotSupportedException
                                              or PathTooLongException)
        {
            _logger.LogWarning(
                exception,
                "Configured worker executable path '{ExecutablePath}' is not a valid filesystem path; "
                + "orphan cleanup will match by image name only.",
                _workerOptions.ExecutablePath);
            return null;
        }
    }

    /// <summary>
    /// Derives the process image name (file name without extension) to search for.
    /// </summary>
    /// <param name="configuredPath">Normalized configured worker path, or <see langword="null"/> when it could not be resolved.</param>
    /// <returns>
    /// The image name taken from <paramref name="configuredPath"/>, or the
    /// default worker image name when the path is <see langword="null"/> or has
    /// no file-name component.
    /// </returns>
    private string ResolveProcessName(string? configuredPath)
    {
        if (configuredPath is not null)
        {
            string configuredName = Path.GetFileNameWithoutExtension(configuredPath);
            if (!string.IsNullOrWhiteSpace(configuredName))
            {
                return configuredName;
            }

            // A path that ends in a directory separator (or is only an
            // extension) yields an empty name. What an inspector returns for an
            // empty name is implementation-defined, and candidates with an
            // unreadable path are killed on image-name match alone, so never
            // search with an empty filter.
            _logger.LogWarning(
                "Configured worker executable path '{ExecutablePath}' does not contain a file name; "
                + "orphan cleanup will use the default image name.",
                configuredPath);
        }

        return Path.GetFileNameWithoutExtension(DefaultWorkerExecutableName);
    }
}
|
||||
Reference in New Issue
Block a user