Resolve Server-002, -004, -005, -006 code-review findings

Server-002: the gateway never terminated leftover MxGateway.Worker.exe
processes at startup, contradicting gateway.md and CLAUDE.md. Added
IRunningProcessInspector/SystemRunningProcessInspector, OrphanWorkerTerminator,
and OrphanWorkerCleanupHostedService (best-effort, runs before sessions are
accepted); updated gateway.md to describe the implemented behavior.

Server-004: API-key scopes were persisted verbatim with no validation. Added
GatewayScopes.All/IsKnown; the CLI parser and dashboard create path now
reject unknown scope strings.

Server-005: a non-SqlException/InvalidOperationException fault on the initial
Galaxy hierarchy load faulted the BackgroundService. ExecuteAsync now catches
all non-cancellation exceptions on first load and RefreshCoreAsync broadens
its catch so the cache records Stale/Unavailable instead.

Server-006: OpenSessionAsync incremented the open-sessions gauge before
alarm auto-subscribe; an auto-subscribe failure leaked the gauge. The catch
path now calls SessionRemoved() when the gauge was incremented.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-18 21:31:10 -04:00
parent 5e795aeeb8
commit 1d9e3afadd
18 changed files with 676 additions and 15 deletions
@@ -0,0 +1,64 @@
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using MxGateway.Server.Galaxy;
namespace MxGateway.Tests.Galaxy;
/// <summary>
/// Server-005 regression: the initial <c>RefreshAsync</c> call in
/// <see cref="GalaxyHierarchyRefreshService"/> must not let a transient,
/// non-cancellation first-load failure (e.g. a <see cref="TimeoutException"/>
/// or <see cref="System.ComponentModel.Win32Exception"/> from connection
/// establishment) escape and fault the host <c>BackgroundService</c>.
/// </summary>
public sealed class GalaxyHierarchyRefreshServiceTests
{
[Fact]
public async Task ExecuteAsync_WhenFirstRefreshThrowsNonCancellationException_DoesNotFaultBackgroundService()
{
ThrowingCache cache = new(new TimeoutException("connection establishment timed out"));
GalaxyHierarchyRefreshService service = CreateService(cache);
using CancellationTokenSource cts = new();
await service.StartAsync(cts.Token);
await cts.CancelAsync();
// The background loop must have stopped cleanly: ExecuteTask completes
// (RanToCompletion or Canceled) rather than faulting on the first refresh.
Task? executeTask = service.ExecuteTask;
Assert.NotNull(executeTask);
await executeTask;
Assert.False(executeTask.IsFaulted);
Assert.Equal(1, cache.RefreshCallCount);
await service.StopAsync(CancellationToken.None);
}
private static GalaxyHierarchyRefreshService CreateService(IGalaxyHierarchyCache cache)
{
GalaxyRepositoryOptions options = new()
{
DashboardRefreshIntervalSeconds = 3600,
};
return new GalaxyHierarchyRefreshService(
cache,
Options.Create(options),
NullLogger<GalaxyHierarchyRefreshService>.Instance);
}
private sealed class ThrowingCache(Exception toThrow) : IGalaxyHierarchyCache
{
public int RefreshCallCount { get; private set; }
public GalaxyHierarchyCacheEntry Current => GalaxyHierarchyCacheEntry.Empty;
public Task RefreshAsync(CancellationToken cancellationToken)
{
RefreshCallCount++;
throw toThrow;
}
public Task WaitForFirstLoadAsync(CancellationToken cancellationToken) => Task.CompletedTask;
}
}
@@ -112,6 +112,33 @@ public sealed class DashboardApiKeyManagementServiceTests
&& entry.Details == "rotated");
}
/// <summary>
/// Server-004 regression: the dashboard create path must reject a request
/// carrying a non-canonical scope string rather than persisting a key whose
/// scope the authorization resolver never matches.
/// </summary>
[Fact]
public async Task CreateAsync_UnknownScope_DoesNotCallStore()
{
FakeApiKeyAdminStore adminStore = new();
DashboardApiKeyManagementService service = CreateService(adminStore);
DashboardApiKeyManagementRequest request = CreateRequest() with
{
Scopes = new HashSet<string>(
[GatewayScopes.SessionOpen, "invoke", "metadata"],
StringComparer.Ordinal),
};
DashboardApiKeyManagementResult result = await service.CreateAsync(
CreateAuthorizedUser(),
request,
CancellationToken.None);
Assert.False(result.Succeeded);
Assert.Equal(0, adminStore.CreateCount);
}
private static DashboardApiKeyManagementService CreateService(
FakeApiKeyAdminStore? adminStore = null,
FakeApiKeyAuditStore? auditStore = null,
@@ -125,6 +125,44 @@ public sealed class SessionManagerAlarmAutoSubscribeTests
CreateOpenRequest(), "client-1", CancellationToken.None));
}
/// <summary>
/// Server-006 regression: when auto-subscribe throws after
/// <c>SessionOpened()</c> incremented the open-session gauge, the failed
/// open must not leave <c>mxgateway.sessions.open</c> over-counted.
/// </summary>
[Fact]
public async Task OpenSessionAsync_DoesNotLeakOpenSessionGauge_WhenAutoSubscribeFailsWithRequireOn()
{
AlarmAutoSubscribeWorkerClient worker = new()
{
SubscribeAlarmsReplyFactory = _ => new MxCommandReply
{
Kind = MxCommandKind.SubscribeAlarms,
ProtocolStatus = new ProtocolStatus
{
Code = ProtocolStatusCode.MxaccessFailure,
Message = "wnwrap subscribe failed",
},
},
};
using GatewayMetrics metrics = new();
SessionManager manager = NewManager(
worker,
alarms: new AlarmsOptions
{
Enabled = true,
SubscriptionExpression = @"\\HOST\Galaxy!Area1",
RequireSubscribeOnOpen = true,
},
metrics: metrics);
await Assert.ThrowsAsync<SessionManagerException>(
async () => await manager.OpenSessionAsync(
CreateOpenRequest(), "client-1", CancellationToken.None));
Assert.Equal(0, metrics.GetSnapshot().OpenSessions);
}
[Fact]
public async Task OpenSessionAsync_Throws_WhenEnabledButNoExpressionAndRequireOn()
{
@@ -161,7 +199,8 @@ public sealed class SessionManagerAlarmAutoSubscribeTests
private static SessionManager NewManager(
AlarmAutoSubscribeWorkerClient worker,
AlarmsOptions alarms)
AlarmsOptions alarms,
GatewayMetrics? metrics = null)
{
FakeSessionWorkerClientFactory factory = new(worker);
GatewayOptions options = new GatewayOptions
@@ -183,7 +222,7 @@ public sealed class SessionManagerAlarmAutoSubscribeTests
new SessionRegistry(),
factory,
Options.Create(options),
new GatewayMetrics());
metrics ?? new GatewayMetrics());
}
private static SessionOpenRequest CreateOpenRequest()
@@ -0,0 +1,137 @@
using Microsoft.Extensions.Options;
using MxGateway.Server.Configuration;
using MxGateway.Server.Metrics;
using MxGateway.Server.Workers;
namespace MxGateway.Tests.Gateway.Workers;
/// <summary>
/// Server-002 regression: per <c>gateway.md</c> the gateway must terminate
/// orphaned worker processes on startup. These tests pin that the terminator
/// kills leftover workers (matched by executable path, or by image name when
/// the path is unreadable) without touching unrelated processes or itself.
/// </summary>
public sealed class OrphanWorkerTerminatorTests
{
private const string WorkerExecutablePath = @"C:\app\src\MxGateway.Worker\bin\x86\Release\MxGateway.Worker.exe";
[Fact]
public void TerminateOrphans_KillsWorkerProcessesMatchingConfiguredExecutablePath()
{
FakeProcessInspector inspector = new(
[
new RunningProcessInfo(101, WorkerExecutablePath),
new RunningProcessInfo(102, WorkerExecutablePath),
]);
OrphanWorkerTerminator terminator = CreateTerminator(inspector);
int killed = terminator.TerminateOrphans();
Assert.Equal(2, killed);
Assert.Equal([101, 102], inspector.KilledProcessIds.Order());
}
[Fact]
public void TerminateOrphans_KillsImageNameMatchWhenExecutablePathUnreadable()
{
// The x64 gateway cannot introspect the x86 worker's main module, so the
// path comes back null. Image-name match is the only signal — and it is
// exactly the orphan worker case, so the process must still be killed.
FakeProcessInspector inspector = new(
[
new RunningProcessInfo(201, ExecutablePath: null),
]);
OrphanWorkerTerminator terminator = CreateTerminator(inspector);
int killed = terminator.TerminateOrphans();
Assert.Equal(1, killed);
Assert.Equal([201], inspector.KilledProcessIds);
}
[Fact]
public void TerminateOrphans_DoesNotKillUnrelatedProcessSharingImageName()
{
// A process with the same image name but a different executable path is
// not our worker and must be left alone.
FakeProcessInspector inspector = new(
[
new RunningProcessInfo(301, @"C:\other\place\MxGateway.Worker.exe"),
]);
OrphanWorkerTerminator terminator = CreateTerminator(inspector);
int killed = terminator.TerminateOrphans();
Assert.Equal(0, killed);
Assert.Empty(inspector.KilledProcessIds);
}
[Fact]
public void TerminateOrphans_DoesNotKillCurrentProcess()
{
FakeProcessInspector inspector = new(
[
new RunningProcessInfo(Environment.ProcessId, WorkerExecutablePath),
]);
OrphanWorkerTerminator terminator = CreateTerminator(inspector);
int killed = terminator.TerminateOrphans();
Assert.Equal(0, killed);
Assert.Empty(inspector.KilledProcessIds);
}
[Fact]
public void TerminateOrphans_ContinuesWhenOneKillThrows()
{
FakeProcessInspector inspector = new(
[
new RunningProcessInfo(401, WorkerExecutablePath),
new RunningProcessInfo(402, WorkerExecutablePath),
])
{
ThrowOnKillProcessId = 401,
};
OrphanWorkerTerminator terminator = CreateTerminator(inspector);
int killed = terminator.TerminateOrphans();
Assert.Equal(1, killed);
Assert.Contains(402, inspector.KilledProcessIds);
}
private static OrphanWorkerTerminator CreateTerminator(IRunningProcessInspector inspector)
{
GatewayOptions options = new()
{
Worker = new WorkerOptions
{
ExecutablePath = WorkerExecutablePath,
},
};
return new OrphanWorkerTerminator(
Options.Create(options),
inspector,
new GatewayMetrics());
}
private sealed class FakeProcessInspector(IReadOnlyList<RunningProcessInfo> processes)
: IRunningProcessInspector
{
public List<int> KilledProcessIds { get; } = [];
public int? ThrowOnKillProcessId { get; init; }
public IReadOnlyList<RunningProcessInfo> GetProcessesByName(string processName) => processes;
public void Kill(int processId)
{
if (ThrowOnKillProcessId == processId)
{
throw new InvalidOperationException("Process has already exited.");
}
KilledProcessIds.Add(processId);
}
}
}
@@ -52,6 +52,56 @@ public sealed class ApiKeyAdminCommandLineParserTests
Assert.Contains("events:read", result.Command.Scopes);
}
/// <summary>
/// Server-004 regression: a create-key command with a non-canonical scope
/// string (e.g. CLAUDE.md's stale <c>invoke</c> instead of <c>invoke:read</c>)
/// must be rejected at parse time rather than silently persisting an
/// unusable scope the authorization resolver never matches.
/// </summary>
[Fact]
public void Parse_CreateKeyCommand_RejectsUnknownScope()
{
ApiKeyAdminParseResult result = ApiKeyAdminCommandLineParser.Parse(
[
"apikey",
"create-key",
"--key-id",
"operator01",
"--display-name",
"Operator",
"--scopes",
"session:open,invoke,metadata",
]);
Assert.True(result.IsApiKeyCommand);
Assert.Null(result.Command);
Assert.NotNull(result.Error);
Assert.Contains("invoke", result.Error, StringComparison.Ordinal);
Assert.Contains("metadata", result.Error, StringComparison.Ordinal);
}
/// <summary>Verifies a create-key command with only canonical scopes parses successfully.</summary>
[Fact]
public void Parse_CreateKeyCommand_AcceptsAllCanonicalScopes()
{
ApiKeyAdminParseResult result = ApiKeyAdminCommandLineParser.Parse(
[
"apikey",
"create-key",
"--key-id",
"operator01",
"--display-name",
"Operator",
"--scopes",
"session:open,session:close,invoke:read,invoke:write,invoke:secure,events:read,metadata:read,admin",
]);
Assert.True(result.IsApiKeyCommand);
Assert.Null(result.Error);
Assert.NotNull(result.Command);
Assert.Equal(8, result.Command.Scopes.Count);
}
/// <summary>
/// Verifies create key without display name returns error.
/// </summary>