Resolve Server-002, -004, -005, -006 code-review findings

Server-002: the gateway never terminated leftover MxGateway.Worker.exe
processes at startup, contradicting gateway.md and CLAUDE.md. Added
IRunningProcessInspector/SystemRunningProcessInspector, OrphanWorkerTerminator,
and OrphanWorkerCleanupHostedService (best-effort, runs before sessions are
accepted); updated gateway.md to describe the implemented behavior.

Server-004: API-key scopes were persisted verbatim with no validation. Added
GatewayScopes.All/IsKnown; the CLI parser and dashboard create path now
reject unknown scope strings.

Server-005: any exception other than SqlException or InvalidOperationException
thrown during the initial Galaxy hierarchy load faulted the BackgroundService.
ExecuteAsync now catches all non-cancellation exceptions on the first load,
and RefreshCoreAsync broadens its catch so the cache records
Stale/Unavailable instead.

Server-006: OpenSessionAsync incremented the open-sessions gauge before
alarm auto-subscribe; an auto-subscribe failure leaked the gauge. The catch
path now calls SessionRemoved() when the gauge was incremented.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-18 21:31:10 -04:00
parent 5e795aeeb8
commit 1d9e3afadd
18 changed files with 676 additions and 15 deletions
@@ -125,6 +125,44 @@ public sealed class SessionManagerAlarmAutoSubscribeTests
CreateOpenRequest(), "client-1", CancellationToken.None));
}
/// <summary>
/// Regression guard for Server-006. The worker answers the alarm
/// subscribe command with an <c>MxaccessFailure</c> status while
/// <c>RequireSubscribeOnOpen</c> is enabled, so the open attempt is expected
/// to fail with a <see cref="SessionManagerException"/>. Because
/// <c>SessionOpened()</c> has already bumped the open-session gauge by the
/// time auto-subscribe runs, the test then checks that
/// <c>mxgateway.sessions.open</c> reads zero again after the failure.
/// </summary>
[Fact]
public async Task OpenSessionAsync_DoesNotLeakOpenSessionGauge_WhenAutoSubscribeFailsWithRequireOn()
{
    // Arrange: a worker whose SubscribeAlarms reply always reports failure.
    AlarmAutoSubscribeWorkerClient failingWorker = new()
    {
        SubscribeAlarmsReplyFactory = _ =>
        {
            // Built per invocation so every reply carries its own status object.
            ProtocolStatus failure = new()
            {
                Code = ProtocolStatusCode.MxaccessFailure,
                Message = "wnwrap subscribe failed",
            };

            return new MxCommandReply
            {
                Kind = MxCommandKind.SubscribeAlarms,
                ProtocolStatus = failure,
            };
        },
    };

    // The metrics instance is owned here so the gauge can be inspected afterwards.
    using GatewayMetrics gaugeSource = new();

    AlarmsOptions mandatorySubscribe = new()
    {
        Enabled = true,
        SubscriptionExpression = @"\\HOST\Galaxy!Area1",
        RequireSubscribeOnOpen = true,
    };

    SessionManager sut = NewManager(
        failingWorker,
        alarms: mandatorySubscribe,
        metrics: gaugeSource);

    // Act: opening the session must surface the subscribe failure.
    await Assert.ThrowsAsync<SessionManagerException>(
        async () => await sut.OpenSessionAsync(
            CreateOpenRequest(), "client-1", CancellationToken.None));

    // Assert: the failed open left no session counted as open.
    var openSessionsAfterFailure = gaugeSource.GetSnapshot().OpenSessions;
    Assert.Equal(0, openSessionsAfterFailure);
}
[Fact]
public async Task OpenSessionAsync_Throws_WhenEnabledButNoExpressionAndRequireOn()
{
@@ -161,7 +199,8 @@ public sealed class SessionManagerAlarmAutoSubscribeTests
private static SessionManager NewManager(
AlarmAutoSubscribeWorkerClient worker,
AlarmsOptions alarms)
AlarmsOptions alarms,
GatewayMetrics? metrics = null)
{
FakeSessionWorkerClientFactory factory = new(worker);
GatewayOptions options = new GatewayOptions
@@ -183,7 +222,7 @@ public sealed class SessionManagerAlarmAutoSubscribeTests
new SessionRegistry(),
factory,
Options.Create(options),
new GatewayMetrics());
metrics ?? new GatewayMetrics());
}
private static SessionOpenRequest CreateOpenRequest()