1d9e3afadd
Server-002: the gateway never terminated leftover MxGateway.Worker.exe processes at startup, contradicting gateway.md and CLAUDE.md. Added IRunningProcessInspector/SystemRunningProcessInspector, OrphanWorkerTerminator, and OrphanWorkerCleanupHostedService (best-effort, runs before sessions are accepted); updated gateway.md to describe the implemented behavior. Server-004: API-key scopes were persisted verbatim with no validation. Added GatewayScopes.All/IsKnown; the CLI parser and dashboard create path now reject unknown scope strings. Server-005: a non-SqlException/InvalidOperationException fault on the initial Galaxy hierarchy load faulted the BackgroundService. ExecuteAsync now catches all non-cancellation exceptions on first load and RefreshCoreAsync broadens its catch so the cache records Stale/Unavailable instead. Server-006: OpenSessionAsync incremented the open-sessions gauge before alarm auto-subscribe; an auto-subscribe failure leaked the gauge. The catch path now calls SessionRemoved() when the gauge was incremented. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
139 lines
5.4 KiB
C#
139 lines
5.4 KiB
C#
using Microsoft.Extensions.Logging.Abstractions;
|
|
using Microsoft.Extensions.Options;
|
|
using MxGateway.Server.Configuration;
|
|
using MxGateway.Server.Metrics;
|
|
|
|
namespace MxGateway.Server.Workers;
|
|
|
|
/// <summary>
/// Terminates leftover MXAccess worker processes on gateway startup.
/// <para>
/// Per <c>gateway.md</c> ("first version should terminate orphaned workers
/// on startup") and CLAUDE.md, a gateway restart does not reattach old
/// workers. After an unclean gateway crash, x86 worker processes — each
/// holding an MXAccess COM instance on an STA — survive indefinitely. This
/// terminator finds those processes by executable name/path and kills them
/// before the restarted gateway accepts sessions.
/// </para>
/// </summary>
public sealed class OrphanWorkerTerminator
{
    /// <summary>
    /// Worker image name used when the configured executable path is invalid
    /// or does not contain a file name.
    /// </summary>
    private const string DefaultWorkerImageName = "MxGateway.Worker.exe";

    private readonly IRunningProcessInspector _inspector;
    private readonly GatewayMetrics _metrics;
    private readonly WorkerOptions _workerOptions;
    private readonly ILogger<OrphanWorkerTerminator> _logger;

    /// <summary>Initializes a new instance of the <see cref="OrphanWorkerTerminator"/> class.</summary>
    /// <param name="gatewayOptions">Gateway configuration options; the worker section is read from it.</param>
    /// <param name="inspector">Running-process inspector used to enumerate and kill processes.</param>
    /// <param name="metrics">Gateway metrics collector; notified once per killed worker.</param>
    /// <param name="logger">Optional logger for diagnostic output; a no-op logger is used when omitted.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="gatewayOptions"/>, <paramref name="inspector"/>, or
    /// <paramref name="metrics"/> is <see langword="null"/>.
    /// </exception>
    public OrphanWorkerTerminator(
        IOptions<GatewayOptions> gatewayOptions,
        IRunningProcessInspector inspector,
        GatewayMetrics metrics,
        ILogger<OrphanWorkerTerminator>? logger = null)
    {
        ArgumentNullException.ThrowIfNull(gatewayOptions);
        _inspector = inspector ?? throw new ArgumentNullException(nameof(inspector));
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _workerOptions = gatewayOptions.Value.Worker;
        _logger = logger ?? NullLogger<OrphanWorkerTerminator>.Instance;
    }

    /// <summary>
    /// Finds and kills every leftover worker process. Safe to call once at
    /// startup before any session-owned worker is launched.
    /// </summary>
    /// <remarks>
    /// Failures to kill an individual process are logged and swallowed so a
    /// single stubborn orphan cannot block gateway startup.
    /// </remarks>
    /// <returns>The number of orphan worker processes that were terminated.</returns>
    public int TerminateOrphans()
    {
        string? configuredPath = ResolveConfiguredExecutablePath();
        string processName = ResolveProcessName(configuredPath);
        int currentProcessId = Environment.ProcessId;

        int terminated = 0;
        foreach (RunningProcessInfo candidate in _inspector.GetProcessesByName(processName))
        {
            // Never kill the gateway itself, even if it shares the image name.
            if (candidate.ProcessId == currentProcessId)
            {
                continue;
            }

            if (!IsOrphanWorker(candidate, configuredPath))
            {
                continue;
            }

            try
            {
                _inspector.Kill(candidate.ProcessId);
                _metrics.WorkerKilled("OrphanStartupCleanup");
                terminated++;
                _logger.LogWarning(
                    "Terminated orphan worker process {ProcessId} ({ExecutablePath}) left over from a previous gateway run.",
                    candidate.ProcessId,
                    candidate.ExecutablePath ?? processName);
            }
            catch (Exception exception)
            {
                // The process may have already exited, or be inaccessible.
                // A failure to kill one orphan must not block gateway startup.
                _logger.LogWarning(
                    exception,
                    "Failed to terminate orphan worker process {ProcessId}.",
                    candidate.ProcessId);
            }
        }

        if (terminated > 0)
        {
            _logger.LogInformation("Terminated {Count} orphan worker process(es) on startup.", terminated);
        }

        return terminated;
    }

    /// <summary>
    /// Decides whether a process that matched the worker image name should be
    /// treated as an orphan worker.
    /// </summary>
    /// <param name="candidate">Process returned by the inspector.</param>
    /// <param name="configuredPath">
    /// Full path of the configured worker executable, or <see langword="null"/>
    /// when the configured path could not be resolved.
    /// </param>
    /// <returns><see langword="true"/> when the candidate should be killed.</returns>
    private static bool IsOrphanWorker(RunningProcessInfo candidate, string? configuredPath)
    {
        // When the executable path is readable, require an exact match against
        // the configured worker path so unrelated processes that merely share
        // the image name are never killed. With no configured path to compare
        // against, a candidate with a readable path is never an orphan.
        if (candidate.ExecutablePath is { } path)
        {
            return configuredPath is not null
                && string.Equals(path, configuredPath, StringComparison.OrdinalIgnoreCase);
        }

        // A null path means the x64 gateway could not introspect the module —
        // the expected case for the x86 worker. Image-name match is the only
        // signal available; treat it as an orphan.
        return true;
    }

    /// <summary>
    /// Resolves the configured worker executable path to a full path.
    /// </summary>
    /// <returns>
    /// The full path, or <see langword="null"/> (after logging a warning) when
    /// the configured value is not a valid filesystem path.
    /// </returns>
    private string? ResolveConfiguredExecutablePath()
    {
        try
        {
            return Path.GetFullPath(_workerOptions.ExecutablePath);
        }
        catch (Exception exception) when (exception is ArgumentException
            or NotSupportedException
            or PathTooLongException)
        {
            _logger.LogWarning(
                exception,
                "Configured worker executable path '{ExecutablePath}' is not a valid filesystem path; "
                + "orphan cleanup will match by image name only.",
                _workerOptions.ExecutablePath);
            return null;
        }
    }

    /// <summary>
    /// Derives the process (image) name to search for from the configured
    /// worker path, falling back to the default worker image name.
    /// </summary>
    /// <param name="configuredPath">
    /// Full path of the configured worker executable, or <see langword="null"/>.
    /// </param>
    /// <returns>A non-empty process name without extension.</returns>
    private string ResolveProcessName(string? configuredPath)
    {
        string source = configuredPath ?? DefaultWorkerImageName;
        string processName = Path.GetFileNameWithoutExtension(source);
        if (!string.IsNullOrEmpty(processName))
        {
            return processName;
        }

        // A path with no file-name component (for example one ending in a
        // directory separator) yields an empty name. An empty name must never
        // reach the inspector: candidates with an unreadable path are killed on
        // image-name match alone, so an over-broad lookup would be destructive.
        // NOTE(review): Process.GetProcessesByName is believed to treat an empty
        // name as "all processes" on current .NET — confirm against the
        // inspector implementation.
        _logger.LogWarning(
            "Configured worker executable path '{ExecutablePath}' has no file name; "
            + "orphan cleanup will look for '{DefaultImageName}' instead.",
            source,
            DefaultWorkerImageName);
        return Path.GetFileNameWithoutExtension(DefaultWorkerImageName);
    }
}
|