Resolve Server-002, -004, -005, -006 code-review findings
Server-002: the gateway never terminated leftover MxGateway.Worker.exe processes at startup, contradicting gateway.md and CLAUDE.md. Added IRunningProcessInspector/SystemRunningProcessInspector, OrphanWorkerTerminator, and OrphanWorkerCleanupHostedService (best-effort, runs before sessions are accepted); updated gateway.md to describe the implemented behavior.

Server-004: API-key scopes were persisted verbatim with no validation. Added GatewayScopes.All/IsKnown; the CLI parser and dashboard create path now reject unknown scope strings.

Server-005: a non-SqlException/InvalidOperationException fault on the initial Galaxy hierarchy load faulted the BackgroundService. ExecuteAsync now catches all non-cancellation exceptions on first load and RefreshCoreAsync broadens its catch so the cache records Stale/Unavailable instead.

Server-006: OpenSessionAsync incremented the open-sessions gauge before alarm auto-subscribe; an auto-subscribe failure leaked the gauge. The catch path now calls SessionRemoved() when the gauge was incremented.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,4 @@
|
||||
using Google.Protobuf.WellKnownTypes;
|
||||
using Microsoft.Data.SqlClient;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using MxGateway.Contracts.Proto.Galaxy;
|
||||
using MxGateway.Server.Dashboard;
|
||||
@@ -181,8 +180,13 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache
|
||||
{
|
||||
throw;
|
||||
}
|
||||
catch (Exception exception) when (exception is SqlException or InvalidOperationException)
|
||||
catch (Exception exception)
|
||||
{
|
||||
// Catch every non-cancellation failure — not just SqlException /
|
||||
// InvalidOperationException. A TimeoutException or Win32Exception
|
||||
// from connection establishment, or another DbException subtype,
|
||||
// must still degrade gracefully to Stale/Unavailable and complete
|
||||
// _firstLoad rather than escape and fault the refresh BackgroundService.
|
||||
_logger?.LogWarning(exception, "Galaxy hierarchy cache refresh failed.");
|
||||
GalaxyHierarchyCacheEntry failed = previous with
|
||||
{
|
||||
|
||||
@@ -26,6 +26,15 @@ public sealed class GalaxyHierarchyRefreshService(
|
||||
{
|
||||
return;
|
||||
}
|
||||
catch (Exception exception)
|
||||
{
|
||||
// A transient first-load failure (e.g. a TimeoutException or
|
||||
// Win32Exception from connection establishment, or any other
|
||||
// exception that escapes the cache's own handling) must not fault this
|
||||
// BackgroundService and stop the whole gateway. The cache records
|
||||
// its own Unavailable/Stale status; the periodic tick below retries.
|
||||
logger.LogWarning(exception, "Initial Galaxy hierarchy cache load failed; will retry on the refresh interval.");
|
||||
}
|
||||
|
||||
using PeriodicTimer timer = new(interval, _timeProvider);
|
||||
try
|
||||
|
||||
Reference in New Issue
Block a user