Resolve Server-002, -004, -005, -006 code-review findings
Server-002: the gateway never terminated leftover MxGateway.Worker.exe processes at startup, contradicting gateway.md and CLAUDE.md. Added IRunningProcessInspector/SystemRunningProcessInspector, OrphanWorkerTerminator, and OrphanWorkerCleanupHostedService (best-effort, runs before sessions are accepted); updated gateway.md to describe the implemented behavior. Server-004: API-key scopes were persisted verbatim with no validation. Added GatewayScopes.All/IsKnown; the CLI parser and dashboard create path now reject unknown scope strings. Server-005: a non-SqlException/InvalidOperationException fault on the initial Galaxy hierarchy load faulted the BackgroundService. ExecuteAsync now catches all non-cancellation exceptions on first load and RefreshCoreAsync broadens its catch so the cache records Stale/Unavailable instead. Server-006: OpenSessionAsync incremented the open-sessions gauge before alarm auto-subscribe; an auto-subscribe failure leaked the gauge. The catch path now calls SessionRemoved() when the gauge was incremented. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,64 @@
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Options;
|
||||
using MxGateway.Server.Galaxy;
|
||||
|
||||
namespace MxGateway.Tests.Galaxy;
|
||||
|
||||
/// <summary>
|
||||
/// Server-005 regression: the initial <c>RefreshAsync</c> call in
|
||||
/// <see cref="GalaxyHierarchyRefreshService"/> must not let a transient,
|
||||
/// non-cancellation first-load failure (e.g. a <see cref="TimeoutException"/>
|
||||
/// or <see cref="System.ComponentModel.Win32Exception"/> from connection
|
||||
/// establishment) escape and fault the host <c>BackgroundService</c>.
|
||||
/// </summary>
|
||||
public sealed class GalaxyHierarchyRefreshServiceTests
|
||||
{
|
||||
[Fact]
|
||||
public async Task ExecuteAsync_WhenFirstRefreshThrowsNonCancellationException_DoesNotFaultBackgroundService()
|
||||
{
|
||||
ThrowingCache cache = new(new TimeoutException("connection establishment timed out"));
|
||||
GalaxyHierarchyRefreshService service = CreateService(cache);
|
||||
|
||||
using CancellationTokenSource cts = new();
|
||||
|
||||
await service.StartAsync(cts.Token);
|
||||
await cts.CancelAsync();
|
||||
|
||||
// The background loop must have stopped cleanly: ExecuteTask completes
|
||||
// (RanToCompletion or Canceled) rather than faulting on the first refresh.
|
||||
Task? executeTask = service.ExecuteTask;
|
||||
Assert.NotNull(executeTask);
|
||||
await executeTask;
|
||||
Assert.False(executeTask.IsFaulted);
|
||||
Assert.Equal(1, cache.RefreshCallCount);
|
||||
|
||||
await service.StopAsync(CancellationToken.None);
|
||||
}
|
||||
|
||||
private static GalaxyHierarchyRefreshService CreateService(IGalaxyHierarchyCache cache)
|
||||
{
|
||||
GalaxyRepositoryOptions options = new()
|
||||
{
|
||||
DashboardRefreshIntervalSeconds = 3600,
|
||||
};
|
||||
return new GalaxyHierarchyRefreshService(
|
||||
cache,
|
||||
Options.Create(options),
|
||||
NullLogger<GalaxyHierarchyRefreshService>.Instance);
|
||||
}
|
||||
|
||||
private sealed class ThrowingCache(Exception toThrow) : IGalaxyHierarchyCache
|
||||
{
|
||||
public int RefreshCallCount { get; private set; }
|
||||
|
||||
public GalaxyHierarchyCacheEntry Current => GalaxyHierarchyCacheEntry.Empty;
|
||||
|
||||
public Task RefreshAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
RefreshCallCount++;
|
||||
throw toThrow;
|
||||
}
|
||||
|
||||
public Task WaitForFirstLoadAsync(CancellationToken cancellationToken) => Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
@@ -112,6 +112,33 @@ public sealed class DashboardApiKeyManagementServiceTests
|
||||
&& entry.Details == "rotated");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Server-004 regression: the dashboard create path must reject a request
|
||||
/// carrying a non-canonical scope string rather than persisting a key whose
|
||||
/// scope the authorization resolver never matches.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task CreateAsync_UnknownScope_DoesNotCallStore()
|
||||
{
|
||||
FakeApiKeyAdminStore adminStore = new();
|
||||
DashboardApiKeyManagementService service = CreateService(adminStore);
|
||||
|
||||
DashboardApiKeyManagementRequest request = CreateRequest() with
|
||||
{
|
||||
Scopes = new HashSet<string>(
|
||||
[GatewayScopes.SessionOpen, "invoke", "metadata"],
|
||||
StringComparer.Ordinal),
|
||||
};
|
||||
|
||||
DashboardApiKeyManagementResult result = await service.CreateAsync(
|
||||
CreateAuthorizedUser(),
|
||||
request,
|
||||
CancellationToken.None);
|
||||
|
||||
Assert.False(result.Succeeded);
|
||||
Assert.Equal(0, adminStore.CreateCount);
|
||||
}
|
||||
|
||||
private static DashboardApiKeyManagementService CreateService(
|
||||
FakeApiKeyAdminStore? adminStore = null,
|
||||
FakeApiKeyAuditStore? auditStore = null,
|
||||
|
||||
@@ -125,6 +125,44 @@ public sealed class SessionManagerAlarmAutoSubscribeTests
|
||||
CreateOpenRequest(), "client-1", CancellationToken.None));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Server-006 regression: when auto-subscribe throws after
|
||||
/// <c>SessionOpened()</c> incremented the open-session gauge, the failed
|
||||
/// open must not leave <c>mxgateway.sessions.open</c> over-counted.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task OpenSessionAsync_DoesNotLeakOpenSessionGauge_WhenAutoSubscribeFailsWithRequireOn()
|
||||
{
|
||||
AlarmAutoSubscribeWorkerClient worker = new()
|
||||
{
|
||||
SubscribeAlarmsReplyFactory = _ => new MxCommandReply
|
||||
{
|
||||
Kind = MxCommandKind.SubscribeAlarms,
|
||||
ProtocolStatus = new ProtocolStatus
|
||||
{
|
||||
Code = ProtocolStatusCode.MxaccessFailure,
|
||||
Message = "wnwrap subscribe failed",
|
||||
},
|
||||
},
|
||||
};
|
||||
using GatewayMetrics metrics = new();
|
||||
SessionManager manager = NewManager(
|
||||
worker,
|
||||
alarms: new AlarmsOptions
|
||||
{
|
||||
Enabled = true,
|
||||
SubscriptionExpression = @"\\HOST\Galaxy!Area1",
|
||||
RequireSubscribeOnOpen = true,
|
||||
},
|
||||
metrics: metrics);
|
||||
|
||||
await Assert.ThrowsAsync<SessionManagerException>(
|
||||
async () => await manager.OpenSessionAsync(
|
||||
CreateOpenRequest(), "client-1", CancellationToken.None));
|
||||
|
||||
Assert.Equal(0, metrics.GetSnapshot().OpenSessions);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task OpenSessionAsync_Throws_WhenEnabledButNoExpressionAndRequireOn()
|
||||
{
|
||||
@@ -161,7 +199,8 @@ public sealed class SessionManagerAlarmAutoSubscribeTests
|
||||
|
||||
private static SessionManager NewManager(
|
||||
AlarmAutoSubscribeWorkerClient worker,
|
||||
AlarmsOptions alarms)
|
||||
AlarmsOptions alarms,
|
||||
GatewayMetrics? metrics = null)
|
||||
{
|
||||
FakeSessionWorkerClientFactory factory = new(worker);
|
||||
GatewayOptions options = new GatewayOptions
|
||||
@@ -183,7 +222,7 @@ public sealed class SessionManagerAlarmAutoSubscribeTests
|
||||
new SessionRegistry(),
|
||||
factory,
|
||||
Options.Create(options),
|
||||
new GatewayMetrics());
|
||||
metrics ?? new GatewayMetrics());
|
||||
}
|
||||
|
||||
private static SessionOpenRequest CreateOpenRequest()
|
||||
|
||||
@@ -0,0 +1,137 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
using MxGateway.Server.Configuration;
|
||||
using MxGateway.Server.Metrics;
|
||||
using MxGateway.Server.Workers;
|
||||
|
||||
namespace MxGateway.Tests.Gateway.Workers;
|
||||
|
||||
/// <summary>
|
||||
/// Server-002 regression: per <c>gateway.md</c> the gateway must terminate
|
||||
/// orphaned worker processes on startup. These tests pin that the terminator
|
||||
/// kills leftover workers (matched by executable path, or by image name when
|
||||
/// the path is unreadable) without touching unrelated processes or itself.
|
||||
/// </summary>
|
||||
public sealed class OrphanWorkerTerminatorTests
|
||||
{
|
||||
private const string WorkerExecutablePath = @"C:\app\src\MxGateway.Worker\bin\x86\Release\MxGateway.Worker.exe";
|
||||
|
||||
[Fact]
|
||||
public void TerminateOrphans_KillsWorkerProcessesMatchingConfiguredExecutablePath()
|
||||
{
|
||||
FakeProcessInspector inspector = new(
|
||||
[
|
||||
new RunningProcessInfo(101, WorkerExecutablePath),
|
||||
new RunningProcessInfo(102, WorkerExecutablePath),
|
||||
]);
|
||||
OrphanWorkerTerminator terminator = CreateTerminator(inspector);
|
||||
|
||||
int killed = terminator.TerminateOrphans();
|
||||
|
||||
Assert.Equal(2, killed);
|
||||
Assert.Equal([101, 102], inspector.KilledProcessIds.Order());
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TerminateOrphans_KillsImageNameMatchWhenExecutablePathUnreadable()
|
||||
{
|
||||
// The x64 gateway cannot introspect the x86 worker's main module, so the
|
||||
// path comes back null. Image-name match is the only signal — and it is
|
||||
// exactly the orphan worker case, so the process must still be killed.
|
||||
FakeProcessInspector inspector = new(
|
||||
[
|
||||
new RunningProcessInfo(201, ExecutablePath: null),
|
||||
]);
|
||||
OrphanWorkerTerminator terminator = CreateTerminator(inspector);
|
||||
|
||||
int killed = terminator.TerminateOrphans();
|
||||
|
||||
Assert.Equal(1, killed);
|
||||
Assert.Equal([201], inspector.KilledProcessIds);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TerminateOrphans_DoesNotKillUnrelatedProcessSharingImageName()
|
||||
{
|
||||
// A process with the same image name but a different executable path is
|
||||
// not our worker and must be left alone.
|
||||
FakeProcessInspector inspector = new(
|
||||
[
|
||||
new RunningProcessInfo(301, @"C:\other\place\MxGateway.Worker.exe"),
|
||||
]);
|
||||
OrphanWorkerTerminator terminator = CreateTerminator(inspector);
|
||||
|
||||
int killed = terminator.TerminateOrphans();
|
||||
|
||||
Assert.Equal(0, killed);
|
||||
Assert.Empty(inspector.KilledProcessIds);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TerminateOrphans_DoesNotKillCurrentProcess()
|
||||
{
|
||||
FakeProcessInspector inspector = new(
|
||||
[
|
||||
new RunningProcessInfo(Environment.ProcessId, WorkerExecutablePath),
|
||||
]);
|
||||
OrphanWorkerTerminator terminator = CreateTerminator(inspector);
|
||||
|
||||
int killed = terminator.TerminateOrphans();
|
||||
|
||||
Assert.Equal(0, killed);
|
||||
Assert.Empty(inspector.KilledProcessIds);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TerminateOrphans_ContinuesWhenOneKillThrows()
|
||||
{
|
||||
FakeProcessInspector inspector = new(
|
||||
[
|
||||
new RunningProcessInfo(401, WorkerExecutablePath),
|
||||
new RunningProcessInfo(402, WorkerExecutablePath),
|
||||
])
|
||||
{
|
||||
ThrowOnKillProcessId = 401,
|
||||
};
|
||||
OrphanWorkerTerminator terminator = CreateTerminator(inspector);
|
||||
|
||||
int killed = terminator.TerminateOrphans();
|
||||
|
||||
Assert.Equal(1, killed);
|
||||
Assert.Contains(402, inspector.KilledProcessIds);
|
||||
}
|
||||
|
||||
private static OrphanWorkerTerminator CreateTerminator(IRunningProcessInspector inspector)
|
||||
{
|
||||
GatewayOptions options = new()
|
||||
{
|
||||
Worker = new WorkerOptions
|
||||
{
|
||||
ExecutablePath = WorkerExecutablePath,
|
||||
},
|
||||
};
|
||||
return new OrphanWorkerTerminator(
|
||||
Options.Create(options),
|
||||
inspector,
|
||||
new GatewayMetrics());
|
||||
}
|
||||
|
||||
private sealed class FakeProcessInspector(IReadOnlyList<RunningProcessInfo> processes)
|
||||
: IRunningProcessInspector
|
||||
{
|
||||
public List<int> KilledProcessIds { get; } = [];
|
||||
|
||||
public int? ThrowOnKillProcessId { get; init; }
|
||||
|
||||
public IReadOnlyList<RunningProcessInfo> GetProcessesByName(string processName) => processes;
|
||||
|
||||
public void Kill(int processId)
|
||||
{
|
||||
if (ThrowOnKillProcessId == processId)
|
||||
{
|
||||
throw new InvalidOperationException("Process has already exited.");
|
||||
}
|
||||
|
||||
KilledProcessIds.Add(processId);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -52,6 +52,56 @@ public sealed class ApiKeyAdminCommandLineParserTests
|
||||
Assert.Contains("events:read", result.Command.Scopes);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Server-004 regression: a create-key command with a non-canonical scope
|
||||
/// string (e.g. CLAUDE.md's stale <c>invoke</c> instead of <c>invoke:read</c>)
|
||||
/// must be rejected at parse time rather than silently persisting an
|
||||
/// unusable scope the authorization resolver never matches.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public void Parse_CreateKeyCommand_RejectsUnknownScope()
|
||||
{
|
||||
ApiKeyAdminParseResult result = ApiKeyAdminCommandLineParser.Parse(
|
||||
[
|
||||
"apikey",
|
||||
"create-key",
|
||||
"--key-id",
|
||||
"operator01",
|
||||
"--display-name",
|
||||
"Operator",
|
||||
"--scopes",
|
||||
"session:open,invoke,metadata",
|
||||
]);
|
||||
|
||||
Assert.True(result.IsApiKeyCommand);
|
||||
Assert.Null(result.Command);
|
||||
Assert.NotNull(result.Error);
|
||||
Assert.Contains("invoke", result.Error, StringComparison.Ordinal);
|
||||
Assert.Contains("metadata", result.Error, StringComparison.Ordinal);
|
||||
}
|
||||
|
||||
/// <summary>Verifies a create-key command with only canonical scopes parses successfully.</summary>
|
||||
[Fact]
|
||||
public void Parse_CreateKeyCommand_AcceptsAllCanonicalScopes()
|
||||
{
|
||||
ApiKeyAdminParseResult result = ApiKeyAdminCommandLineParser.Parse(
|
||||
[
|
||||
"apikey",
|
||||
"create-key",
|
||||
"--key-id",
|
||||
"operator01",
|
||||
"--display-name",
|
||||
"Operator",
|
||||
"--scopes",
|
||||
"session:open,session:close,invoke:read,invoke:write,invoke:secure,events:read,metadata:read,admin",
|
||||
]);
|
||||
|
||||
Assert.True(result.IsApiKeyCommand);
|
||||
Assert.Null(result.Error);
|
||||
Assert.NotNull(result.Command);
|
||||
Assert.Equal(8, result.Command.Scopes.Count);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Verifies create key without display name returns error.
|
||||
/// </summary>
|
||||
|
||||
Reference in New Issue
Block a user