Files
mxaccessgw/src/MxGateway.Server/Workers/OrphanWorkerTerminator.cs
T
Joseph Doherty 1d9e3afadd Resolve Server-002, -004, -005, -006 code-review findings
Server-002: the gateway never terminated leftover MxGateway.Worker.exe
processes at startup, contradicting gateway.md and CLAUDE.md. Added
IRunningProcessInspector/SystemRunningProcessInspector, OrphanWorkerTerminator,
and OrphanWorkerCleanupHostedService (best-effort, runs before sessions are
accepted); updated gateway.md to describe the implemented behavior.

Server-004: API-key scopes were persisted verbatim with no validation. Added
GatewayScopes.All/IsKnown; the CLI parser and dashboard create path now
reject unknown scope strings.

Server-005: a non-SqlException/InvalidOperationException fault on the initial
Galaxy hierarchy load faulted the BackgroundService. ExecuteAsync now catches
all non-cancellation exceptions on first load and RefreshCoreAsync broadens
its catch so the cache records Stale/Unavailable instead.

Server-006: OpenSessionAsync incremented the open-sessions gauge before
alarm auto-subscribe; an auto-subscribe failure leaked the gauge. The catch
path now calls SessionRemoved() when the gauge was incremented.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 21:31:10 -04:00

139 lines
5.4 KiB
C#

using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using MxGateway.Server.Configuration;
using MxGateway.Server.Metrics;
namespace MxGateway.Server.Workers;
/// <summary>
/// Terminates leftover MXAccess worker processes on gateway startup.
/// <para>
/// Per <c>gateway.md</c> ("first version should terminate orphaned workers
/// on startup") and CLAUDE.md, a gateway restart does not reattach old
/// workers. After an unclean gateway crash, x86 worker processes — each
/// holding an MXAccess COM instance on an STA — survive indefinitely. This
/// terminator finds those processes by executable name/path and kills them
/// before the restarted gateway accepts sessions.
/// </para>
/// <para>
/// Matching rules: candidates are looked up by image name (derived from the
/// configured worker executable path, or a built-in default when that path is
/// unusable). A candidate whose executable path can be read is killed only if
/// that path equals the configured path; a candidate whose path cannot be read
/// is killed on the image-name match alone.
/// </para>
/// </summary>
public sealed class OrphanWorkerTerminator
{
    /// <summary>Worker executable file name used when the configured path cannot supply one.</summary>
    private const string DefaultWorkerExecutableName = "MxGateway.Worker.exe";

    private readonly IRunningProcessInspector _inspector;
    private readonly GatewayMetrics _metrics;
    private readonly WorkerOptions _workerOptions;
    private readonly ILogger<OrphanWorkerTerminator> _logger;

    /// <summary>Initializes a new instance of the <see cref="OrphanWorkerTerminator"/> class.</summary>
    /// <param name="gatewayOptions">Gateway configuration options.</param>
    /// <param name="inspector">Running-process inspector.</param>
    /// <param name="metrics">Gateway metrics collector.</param>
    /// <param name="logger">Optional logger for diagnostic output.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="gatewayOptions"/>, <paramref name="inspector"/> or
    /// <paramref name="metrics"/> is <see langword="null"/>.
    /// </exception>
    public OrphanWorkerTerminator(
        IOptions<GatewayOptions> gatewayOptions,
        IRunningProcessInspector inspector,
        GatewayMetrics metrics,
        ILogger<OrphanWorkerTerminator>? logger = null)
    {
        ArgumentNullException.ThrowIfNull(gatewayOptions);
        _inspector = inspector ?? throw new ArgumentNullException(nameof(inspector));
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _workerOptions = gatewayOptions.Value.Worker;
        _logger = logger ?? NullLogger<OrphanWorkerTerminator>.Instance;
    }

    /// <summary>
    /// Finds and kills every leftover worker process. Safe to call once at
    /// startup before any session-owned worker is launched.
    /// </summary>
    /// <remarks>
    /// A failure to kill an individual process is logged and skipped; it does
    /// not abort the sweep.
    /// </remarks>
    /// <returns>The number of orphan worker processes that were terminated.</returns>
    public int TerminateOrphans()
    {
        string? configuredPath = ResolveConfiguredExecutablePath();
        string processName = ResolveProcessName(configuredPath);
        int currentProcessId = Environment.ProcessId;
        int terminated = 0;

        foreach (RunningProcessInfo candidate in _inspector.GetProcessesByName(processName))
        {
            // Never kill the gateway itself, even if it shares the image name.
            if (candidate.ProcessId == currentProcessId)
            {
                continue;
            }

            if (!IsOrphanWorker(candidate, configuredPath))
            {
                continue;
            }

            try
            {
                _inspector.Kill(candidate.ProcessId);

                // Count the kill as soon as it succeeds so a later metrics or
                // logging failure cannot make the returned total under-report.
                terminated++;
                _metrics.WorkerKilled("OrphanStartupCleanup");
                _logger.LogWarning(
                    "Terminated orphan worker process {ProcessId} ({ExecutablePath}) left over from a previous gateway run.",
                    candidate.ProcessId,
                    candidate.ExecutablePath ?? processName);
            }
            catch (Exception exception)
            {
                // The process may have already exited, or be inaccessible.
                // A failure to kill one orphan must not block gateway startup.
                _logger.LogWarning(
                    exception,
                    "Failed to terminate orphan worker process {ProcessId}.",
                    candidate.ProcessId);
            }
        }

        if (terminated > 0)
        {
            _logger.LogInformation("Terminated {Count} orphan worker process(es) on startup.", terminated);
        }

        return terminated;
    }

    /// <summary>
    /// Decides whether a process that matched the worker image name should be
    /// treated as an orphan worker.
    /// </summary>
    /// <param name="candidate">Process returned by the inspector.</param>
    /// <param name="configuredPath">
    /// Absolute configured worker path, or <see langword="null"/> when the
    /// configured value could not be resolved.
    /// </param>
    /// <returns><see langword="true"/> when the candidate should be killed.</returns>
    private static bool IsOrphanWorker(RunningProcessInfo candidate, string? configuredPath)
    {
        // When the executable path is readable, require an exact match against
        // the configured worker path so unrelated processes that merely share
        // the image name are never killed. Note that this also means a readable
        // candidate is never matched when the configured path is unusable.
        if (candidate.ExecutablePath is { } path)
        {
            return configuredPath is not null
                && string.Equals(path, configuredPath, StringComparison.OrdinalIgnoreCase);
        }

        // A null path means the x64 gateway could not introspect the module —
        // the expected case for the x86 worker. Image-name match is the only
        // signal available; treat it as an orphan.
        return true;
    }

    /// <summary>
    /// Resolves the configured worker executable path to an absolute path.
    /// </summary>
    /// <returns>
    /// The absolute path, or <see langword="null"/> (after logging a warning)
    /// when the configured value is rejected by <see cref="Path.GetFullPath(string)"/>.
    /// </returns>
    private string? ResolveConfiguredExecutablePath()
    {
        try
        {
            return Path.GetFullPath(_workerOptions.ExecutablePath);
        }
        catch (Exception exception) when (exception is ArgumentException
            or NotSupportedException
            or PathTooLongException)
        {
            _logger.LogWarning(
                exception,
                "Configured worker executable path '{ExecutablePath}' is not a valid filesystem path; "
                + "orphan cleanup will match by image name only.",
                _workerOptions.ExecutablePath);
            return null;
        }
    }

    /// <summary>
    /// Derives the process image name (file name without extension) to search for.
    /// </summary>
    /// <param name="configuredPath">
    /// Absolute configured worker path, or <see langword="null"/> when unavailable.
    /// </param>
    /// <returns>
    /// The image name taken from <paramref name="configuredPath"/>, or the
    /// default worker image name when the path is <see langword="null"/> or
    /// does not end in a file name. Never empty.
    /// </returns>
    private string ResolveProcessName(string? configuredPath)
    {
        string defaultName = Path.GetFileNameWithoutExtension(DefaultWorkerExecutableName);
        if (configuredPath is null)
        {
            return defaultName;
        }

        string candidateName = Path.GetFileNameWithoutExtension(configuredPath);
        if (!string.IsNullOrWhiteSpace(candidateName))
        {
            return candidateName;
        }

        // A path that ends in a directory separator (or is only an extension)
        // yields an empty file name. An empty name cannot identify the worker
        // image, and combined with the "unreadable path => orphan" rule it
        // could widen the kill set well beyond workers, so fall back instead.
        _logger.LogWarning(
            "Configured worker executable path '{ExecutablePath}' does not end in a file name; "
            + "orphan cleanup will look for the default image name '{ProcessName}'.",
            configuredPath,
            defaultName);
        return defaultName;
    }
}