feat(server): bounded worker-ready wait in GatewaySession (default off)

This commit is contained in:
Joseph Doherty
2026-06-16 16:48:02 -04:00
parent 0efa7d8cca
commit 4966ef3359
3 changed files with 283 additions and 14 deletions
@@ -1,3 +1,4 @@
using System.Diagnostics;
using Google.Protobuf.WellKnownTypes;
using Microsoft.Extensions.Options;
using Microsoft.Extensions.Time.Testing;
@@ -366,6 +367,124 @@ public sealed class SessionManagerTests
Assert.Equal(0, workerClient.InvokeCount);
}
/// <summary>
/// With the opt-in worker-ready wait enabled (500 ms in this test), a worker that starts
/// out <c>Handshaking</c> but flips to <c>Ready</c> inside the wait window must let the
/// command through rather than fail fast.
/// </summary>
[Fact]
public async Task InvokeAsync_WhenWorkerHandshakingThenReadyWithinTimeout_Succeeds()
{
    FakeWorkerClient workerClient = new() { State = WorkerClientState.Handshaking };
    SessionManager manager = CreateManager(
        new FakeSessionWorkerClientFactory(workerClient),
        options: CreateOptions(workerReadyWaitTimeoutMs: 500));
    GatewaySession session = await manager.OpenSessionAsync(CreateOpenRequest(), "client-1", ownerKeyId: null, CancellationToken.None);
    Assert.Equal(SessionState.Ready, session.State);

    // Flip the worker to Ready shortly after the invoke starts waiting. The task is kept
    // instead of discarded so it can be awaited below: that way it never outlives the
    // test and any failure inside it is observed rather than silently dropped.
    Task flipToReady = Task.Run(async () =>
    {
        await Task.Delay(50, CancellationToken.None);
        workerClient.State = WorkerClientState.Ready;
    });

    WorkerCommandReply reply = await manager.InvokeAsync(
        session.SessionId,
        CreateCommand(MxCommandKind.Ping),
        CancellationToken.None);

    // Join the background flip before asserting so no work is left running.
    await flipToReady;

    Assert.NotNull(reply);
    Assert.Equal(1, workerClient.InvokeCount);
}
/// <summary>
/// A terminal worker state (<c>Faulted</c>) must fail fast even with a positive
/// worker-ready wait timeout, surfacing both the session state and the worker state in
/// the error message without burning the timeout.
/// </summary>
[Fact]
public async Task InvokeAsync_WhenWorkerFaulted_FailsFastWithBothStates()
{
    // The fail-fast budget is derived from the configured wait window instead of being a
    // free-standing wall-clock constant: half the window leaves generous headroom for a
    // slow or heavily loaded test agent, yet a regression that waits out the full timeout
    // still lands well outside the budget.
    const int WaitTimeoutMs = 500;
    const int FailFastBudgetMs = WaitTimeoutMs / 2;

    FakeWorkerClient workerClient = new() { State = WorkerClientState.Faulted };
    SessionManager manager = CreateManager(
        new FakeSessionWorkerClientFactory(workerClient),
        options: CreateOptions(workerReadyWaitTimeoutMs: WaitTimeoutMs));
    GatewaySession session = await manager.OpenSessionAsync(CreateOpenRequest(), "client-1", ownerKeyId: null, CancellationToken.None);
    Assert.Equal(SessionState.Ready, session.State);

    Stopwatch stopwatch = Stopwatch.StartNew();
    SessionManagerException exception = await Assert.ThrowsAsync<SessionManagerException>(
        async () => await manager.InvokeAsync(
            session.SessionId,
            CreateCommand(MxCommandKind.Ping),
            CancellationToken.None));
    stopwatch.Stop();

    Assert.True(
        stopwatch.ElapsedMilliseconds < FailFastBudgetMs,
        $"Expected fail-fast within {FailFastBudgetMs}ms (half the {WaitTimeoutMs}ms wait window) but took {stopwatch.ElapsedMilliseconds}ms.");
    Assert.Equal(SessionManagerErrorCode.SessionNotReady, exception.ErrorCode);
    Assert.Contains("Session state is Ready", exception.Message);
    Assert.Contains("worker state is Faulted", exception.Message);

    // The command must never have reached the worker.
    Assert.Equal(0, workerClient.InvokeCount);
}
/// <summary>
/// A worker that remains <c>Handshaking</c> for the entire (deliberately small, 100 ms)
/// wait window causes the invoke to fail only after roughly that window has elapsed,
/// with the error message naming both the session state and the worker state.
/// </summary>
[Fact]
public async Task InvokeAsync_WhenTimeoutElapsesStillNotReady_FailsWithBothStates()
{
    // Arrange: worker stuck in Handshaking, wait window of 100 ms.
    FakeWorkerClient stuckWorker = new() { State = WorkerClientState.Handshaking };
    FakeSessionWorkerClientFactory workerFactory = new(stuckWorker);
    GatewayOptions gatewayOptions = CreateOptions(workerReadyWaitTimeoutMs: 100);
    SessionManager manager = CreateManager(workerFactory, options: gatewayOptions);
    GatewaySession session = await manager.OpenSessionAsync(CreateOpenRequest(), "client-1", ownerKeyId: null, CancellationToken.None);
    Assert.Equal(SessionState.Ready, session.State);

    // Act: time the invoke, which is expected to throw once the window runs out.
    async Task InvokeWhileNotReadyAsync()
    {
        await manager.InvokeAsync(
            session.SessionId,
            CreateCommand(MxCommandKind.Ping),
            CancellationToken.None);
    }

    Stopwatch stopwatch = Stopwatch.StartNew();
    SessionManagerException failure = await Assert.ThrowsAsync<SessionManagerException>(InvokeWhileNotReadyAsync);
    stopwatch.Stop();

    // Assert: the wait spanned the window (90 ms leaves a small tolerance below the
    // configured 100 ms), and the failure reports both states.
    bool waitSpannedTimeout = stopwatch.ElapsedMilliseconds >= 90;
    Assert.True(waitSpannedTimeout, $"Expected the wait to span the timeout but took only {stopwatch.ElapsedMilliseconds}ms.");
    Assert.Equal(SessionManagerErrorCode.SessionNotReady, failure.ErrorCode);
    Assert.Contains("Session state is Ready", failure.Message);
    Assert.Contains("worker state is Handshaking", failure.Message);
    Assert.Equal(0, stuckWorker.InvokeCount);
}
/// <summary>
/// Pins the default behavior (worker-ready wait timeout of 0): a worker that is
/// <c>Handshaking</c> makes the invoke fail immediately with <c>SessionNotReady</c>,
/// matching the original fail-fast path, and the command never reaches the worker.
/// </summary>
[Fact]
public async Task InvokeAsync_WhenTimeoutZero_FailsFastUnchanged()
{
    // Arrange: wait explicitly disabled (0 is also the CreateOptions default).
    FakeWorkerClient handshakingWorker = new() { State = WorkerClientState.Handshaking };
    FakeSessionWorkerClientFactory workerFactory = new(handshakingWorker);
    GatewayOptions gatewayOptions = CreateOptions(workerReadyWaitTimeoutMs: 0);
    SessionManager manager = CreateManager(workerFactory, options: gatewayOptions);
    GatewaySession session = await manager.OpenSessionAsync(CreateOpenRequest(), "client-1", ownerKeyId: null, CancellationToken.None);
    Assert.Equal(SessionState.Ready, session.State);

    // Act: time the invoke, which is expected to throw straight away.
    async Task InvokeWithWaitDisabledAsync()
    {
        await manager.InvokeAsync(
            session.SessionId,
            CreateCommand(MxCommandKind.Ping),
            CancellationToken.None);
    }

    Stopwatch stopwatch = Stopwatch.StartNew();
    SessionManagerException failure = await Assert.ThrowsAsync<SessionManagerException>(InvokeWithWaitDisabledAsync);
    stopwatch.Stop();

    // Assert: no waiting happened and the failure reports both states.
    bool failedImmediately = stopwatch.ElapsedMilliseconds < 100;
    Assert.True(failedImmediately, $"Expected immediate fail-fast but took {stopwatch.ElapsedMilliseconds}ms.");
    Assert.Equal(SessionManagerErrorCode.SessionNotReady, failure.ErrorCode);
    Assert.Contains("Session state is Ready", failure.Message);
    Assert.Contains("worker state is Handshaking", failure.Message);
    Assert.Equal(0, handshakingWorker.InvokeCount);
}
/// <summary>Verifies that closing a session removes it from the registry.</summary>
[Fact]
public async Task CloseSessionAsync_RemovesClosedSession()
@@ -876,7 +995,8 @@ public sealed class SessionManagerTests
private static GatewayOptions CreateOptions(
int maxSessions = 64,
int defaultLeaseSeconds = 1800,
int detachGraceSeconds = 0)
int detachGraceSeconds = 0,
int workerReadyWaitTimeoutMs = 0)
{
return new GatewayOptions
{
@@ -886,6 +1006,7 @@ public sealed class SessionManagerTests
MaxSessions = maxSessions,
DefaultLeaseSeconds = defaultLeaseSeconds,
DetachGraceSeconds = detachGraceSeconds,
WorkerReadyWaitTimeoutMs = workerReadyWaitTimeoutMs,
},
Worker = new WorkerOptions
{