File and fix Server-030 and Client.Dotnet-017 from e2e surfacing
Both findings surfaced when running the cross-language e2e matrix (scripts/run-client-e2e-tests.ps1) against the redeployed gateway at commit84d36b7. Filed in code-reviews/Server/findings.md and code-reviews/Client.Dotnet/findings.md and fixed in the same change. Server-030 (Medium / Error handling): GatewaySession.GetReadyWorkerClient gated on `_state == Ready && _workerClient.State == Ready` but only formatted `_state` into the SessionManagerException message. Under load the gateway-driven `_state` and the worker-driven `WorkerClient.State` can diverge, producing a self-contradictory diagnostic ("Session ... is not ready. Current state is Ready."). The Java e2e client hit this on the 56th item after 55 successful add-items. Rewrote the message to include both states ("Session state is X; worker state is Y"), added an XML doc explaining the two-state contract and that this branch is the fail-fast for a divergence race, and added regression test SessionManagerTests.InvokeAsync_WhenWorkerNotReadyButSessionReady_DiagnosticIncludesBothStates that pins both states appear in the message. The deeper race (should the gateway briefly wait for worker-Ready before failing?) remains open as a follow-up. Client.Dotnet-017 (Low / Error handling): stream-events CLI threw OperationCanceledException as an unhandled exception when the user's --timeout expired before --max-events was reached. Exit code -532462766, no aggregate JSON. The other client CLIs (Go, Rust, Python, Java) exit 0 in this case. Wrapped the `await foreach` in `catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)` so the supplied token's cancellation (--timeout, Ctrl+C, or parent CTS) becomes graceful completion; the aggregate `{ "events": [...] }` JSON still runs after the catch. Added regression test RunAsync_StreamEvents_WhenTimeoutFiresAfterEvents_EmitsCollectedEventsAndExitsZero backed by a new FakeCliClient.StreamHangAfterEvents hook that yields the configured events then parks on the cancellation token. Side cleanup: the GatewayApplicationTests test added under Server-020 was asserting an invariant (`/dashboard/dashboard/X` doesn't exist) that I broke by reverting Server-020 in84d36b7. The doubled endpoint shapes do exist now (MapGroup("/dashboard") prefixing an already "/dashboard/X" @page directive) but they're harmless — no client requests `/dashboard/dashboard/X`. Replaced the test with a positive assertion (`/dashboard/X` routes ARE registered) and rewrote the XML doc to record the actual contract. Verified: dotnet test src/MxGateway.Tests passes 480/480, dotnet test clients/dotnet/MxGateway.Client.Tests passes 77/77, gateway redeployed at this commit and GET http://localhost:5130/dashboard returns 200. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -949,15 +949,31 @@ public sealed class GatewaySession
|
||||
return reply;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the worker client iff both the gateway-side session state AND
|
||||
/// the worker client's own state are <see cref="SessionState.Ready"/> /
|
||||
/// <see cref="WorkerClientState.Ready"/>. The two states can diverge under
|
||||
/// load: <c>_state</c> only transitions on gateway-driven events (open,
|
||||
/// close, fault), while <see cref="WorkerClient.State"/> can shift on
|
||||
/// worker-side signals (heartbeat watchdog, pipe disconnect) before the
|
||||
/// gateway's session-level reaction observes them. When that happens the
|
||||
/// in-flight RPC fails fast here with both states surfaced in the
|
||||
/// diagnostic (Server-030) so the actual mismatch is actionable instead
|
||||
/// of misleading. The session usually transitions to <c>Faulted</c>
|
||||
/// shortly after.
|
||||
/// </summary>
|
||||
private IWorkerClient GetReadyWorkerClient()
|
||||
{
|
||||
lock (_syncRoot)
|
||||
{
|
||||
if (_state != SessionState.Ready || _workerClient?.State != WorkerClientState.Ready)
|
||||
{
|
||||
string workerState = _workerClient is null
|
||||
? "<no worker>"
|
||||
: _workerClient.State.ToString();
|
||||
throw new SessionManagerException(
|
||||
SessionManagerErrorCode.SessionNotReady,
|
||||
$"Session {SessionId} is not ready. Current state is {_state}.");
|
||||
$"Session {SessionId} is not ready. Session state is {_state}; worker state is {workerState}.");
|
||||
}
|
||||
|
||||
return _workerClient;
|
||||
|
||||
@@ -100,39 +100,45 @@ public sealed class GatewayApplicationTests
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Server-020 regression guard. The original Server-020 finding incorrectly
|
||||
/// concluded that the duplicate <c>@page "/dashboard/X"</c> directives were
|
||||
/// redundant because <c>MapGroup("/dashboard")</c> would prepend the prefix
|
||||
/// to all dashboard Razor pages. In practice Blazor SSR's <c>@page</c>
|
||||
/// template matcher does NOT compose with <c>MapGroup</c>, so removing the
|
||||
/// <c>/dashboard/X</c> directive left the dashboard unreachable at runtime
|
||||
/// (every page returned HTTP 500 with "Unable to find the provided template
|
||||
/// '/dashboard/'" from <c>RouteTableFactory.CreateEntry</c>). The duplicate
|
||||
/// <c>@page</c> directives are restored. This test only confirms the
|
||||
/// genuinely-double-prefixed shape (<c>/dashboard/dashboard/X</c>) never
|
||||
/// appears — it never did, since the original duplicates were
|
||||
/// <c>"/"</c> + <c>"/dashboard/"</c>, not <c>"/dashboard/"</c> repeated.
|
||||
/// Server-020 reversal regression guard. The original Server-020 finding
|
||||
/// incorrectly concluded that the duplicate <c>@page "/dashboard/X"</c>
|
||||
/// directives were redundant because <c>MapGroup("/dashboard")</c>
|
||||
/// would prepend the prefix to all dashboard Razor pages. In practice
|
||||
/// Blazor SSR's <c>RouteTableFactory</c> matches against the raw
|
||||
/// <c>@page</c> template values (not against the endpoint-route
|
||||
/// prefix), so removing <c>@page "/dashboard/X"</c> left the dashboard
|
||||
/// unreachable at runtime (every page returned HTTP 500 with "Unable
|
||||
/// to find the provided template '/dashboard/'"). The duplicate
|
||||
/// <c>@page</c> directives are restored, and as a side effect the
|
||||
/// endpoint route table DOES carry the doubled <c>/dashboard/dashboard/X</c>
|
||||
/// shape (because <c>MapGroup("/dashboard")</c> prefixes the already-prefixed
|
||||
/// <c>@page "/dashboard/X"</c>). Those doubled endpoints are harmless —
|
||||
/// no client requests <c>/dashboard/dashboard/X</c> — and removing them
|
||||
/// requires either dropping <c>MapGroup</c> or the <c>@page</c>
|
||||
/// prefix. This test asserts only the positive contract: every
|
||||
/// dashboard page IS reachable under the canonical <c>/dashboard/X</c>
|
||||
/// route, which is what the Blazor router actually serves.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task Build_WhenDashboardEnabled_DoesNotRegisterDoubledDashboardPrefixRoutes()
|
||||
public async Task Build_WhenDashboardEnabled_RegistersCanonicalDashboardRoutes()
|
||||
{
|
||||
await using WebApplication app = GatewayApplication.Build([]);
|
||||
IReadOnlyList<RouteEndpoint> endpoints = GetRouteEndpoints(app);
|
||||
|
||||
string[] doubledRoutes =
|
||||
string[] canonicalRoutes =
|
||||
[
|
||||
"/dashboard/dashboard/",
|
||||
"/dashboard/dashboard/sessions",
|
||||
"/dashboard/dashboard/workers",
|
||||
"/dashboard/dashboard/events",
|
||||
"/dashboard/dashboard/settings",
|
||||
"/dashboard/dashboard/galaxy",
|
||||
"/dashboard/dashboard/apikeys",
|
||||
"/dashboard/dashboard/sessions/{SessionId}",
|
||||
"/dashboard/",
|
||||
"/dashboard/sessions",
|
||||
"/dashboard/workers",
|
||||
"/dashboard/events",
|
||||
"/dashboard/settings",
|
||||
"/dashboard/galaxy",
|
||||
"/dashboard/apikeys",
|
||||
"/dashboard/sessions/{SessionId}",
|
||||
];
|
||||
foreach (string doubled in doubledRoutes)
|
||||
foreach (string canonical in canonicalRoutes)
|
||||
{
|
||||
Assert.DoesNotContain(endpoints, endpoint => endpoint.RoutePattern.RawText == doubled);
|
||||
Assert.Contains(endpoints, endpoint => endpoint.RoutePattern.RawText == canonical);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -300,6 +300,36 @@ public sealed class SessionManagerTests
|
||||
Assert.Equal(0, workerClient.InvokeCount);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Server-030 regression: when the gateway-side <c>SessionState</c> is
|
||||
/// <c>Ready</c> but the worker client's own state is not, the diagnostic
|
||||
/// must surface both states so the mismatch is actionable instead of
|
||||
/// producing a self-contradictory "Session ... is not ready. Current
|
||||
/// state is Ready." message.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task InvokeAsync_WhenWorkerNotReadyButSessionReady_DiagnosticIncludesBothStates()
|
||||
{
|
||||
FakeWorkerClient workerClient = new();
|
||||
SessionManager manager = CreateManager(new FakeSessionWorkerClientFactory(workerClient));
|
||||
GatewaySession session = await manager.OpenSessionAsync(CreateOpenRequest(), "client-1", CancellationToken.None);
|
||||
|
||||
// Force a state mismatch: session stays Ready, worker transitions out.
|
||||
workerClient.State = WorkerClientState.Handshaking;
|
||||
Assert.Equal(SessionState.Ready, session.State);
|
||||
|
||||
SessionManagerException exception = await Assert.ThrowsAsync<SessionManagerException>(
|
||||
async () => await manager.InvokeAsync(
|
||||
session.SessionId,
|
||||
CreateCommand(MxCommandKind.Ping),
|
||||
CancellationToken.None));
|
||||
|
||||
Assert.Equal(SessionManagerErrorCode.SessionNotReady, exception.ErrorCode);
|
||||
Assert.Contains("Session state is Ready", exception.Message);
|
||||
Assert.Contains("worker state is Handshaking", exception.Message);
|
||||
Assert.Equal(0, workerClient.InvokeCount);
|
||||
}
|
||||
|
||||
/// <summary>Verifies that closing a session removes it from the registry.</summary>
|
||||
[Fact]
|
||||
public async Task CloseSessionAsync_RemovesClosedSession()
|
||||
|
||||
Reference in New Issue
Block a user