Fix reliability findings

This commit is contained in:
Joseph Doherty
2026-04-28 06:27:01 -04:00
parent 907aa49aea
commit b0041c5d18
9 changed files with 233 additions and 21 deletions
@@ -66,6 +66,8 @@ public sealed class EventStreamService(
{
await streamCts.CancelAsync().ConfigureAwait(false);
subscriber.Dispose();
Interlocked.Exchange(ref streamQueueDepth, 0);
metrics.SetGrpcEventStreamQueueDepth(0);
metrics.StreamDisconnected("Detached");
try
@@ -101,6 +101,17 @@ public sealed class GatewayMetrics : IDisposable
_sessionsClosedCounter.Add(1);
}
public void SessionRemoved()
{
lock (_syncRoot)
{
if (_openSessions > 0)
{
_openSessions--;
}
}
}
public void WorkerStarted(TimeSpan startupDuration)
{
lock (_syncRoot)
@@ -184,8 +184,11 @@ public sealed class SessionManager : ISessionManager
exception,
"Graceful shutdown failed for session {SessionId}; killing worker.",
session.SessionId);
session.KillWorker(GatewayShutdownReason);
await RemoveSessionAsync(session).ConfigureAwait(false);
if (_registry.TryGet(session.SessionId, out _))
{
session.KillWorker(GatewayShutdownReason);
await RemoveSessionAsync(session).ConfigureAwait(false);
}
}
}
}
@@ -210,7 +213,13 @@ public sealed class SessionManager : ISessionManager
catch (Exception exception)
{
session.MarkFaulted(exception.Message);
if (!wasClosed)
{
_metrics.SessionRemoved();
}
_metrics.Fault(SessionManagerErrorCode.CloseFailed.ToString());
await RemoveSessionAsync(session).ConfigureAwait(false);
throw new SessionManagerException(
SessionManagerErrorCode.CloseFailed,
$"Failed to close session {session.SessionId}.",
@@ -85,6 +85,32 @@ public sealed class EventStreamServiceTests
await WaitUntilAsync(() => session.ActiveEventSubscriberCount == 0);
}
[Fact]
public async Task StreamEventsAsync_WhenDisposedWithBufferedEvents_ResetsStreamQueueDepth()
{
FakeWorkerClient workerClient = new();
GatewaySession session = CreateReadySession(workerClient);
using GatewayMetrics metrics = new();
EventStreamService service = CreateService(
new FakeSessionManager(session),
metrics,
queueCapacity: 8);
workerClient.Events.Add(CreateWorkerEvent(sequence: 1, MxEventFamily.OnDataChange));
workerClient.Events.Add(CreateWorkerEvent(sequence: 2, MxEventFamily.OnDataChange));
workerClient.Events.Add(CreateWorkerEvent(sequence: 3, MxEventFamily.OnDataChange));
workerClient.CompleteAfterConfiguredEvents = true;
await using IAsyncEnumerator<MxEvent> subscriber = service
.StreamEventsAsync(CreateRequest(session.SessionId), CancellationToken.None)
.GetAsyncEnumerator();
Assert.True(await subscriber.MoveNextAsync().AsTask().WaitAsync(TestTimeout));
await WaitUntilAsync(() => metrics.GetSnapshot().GrpcEventStreamQueueDepth > 0);
await subscriber.DisposeAsync();
await WaitUntilAsync(() => metrics.GetSnapshot().GrpcEventStreamQueueDepth == 0);
}
[Fact]
public async Task StreamEventsAsync_WhenStreamQueueOverflows_FaultsSessionAndReportsOverflow()
{
@@ -179,6 +179,48 @@ public sealed class SessionManagerTests
Assert.Equal(1, workerClient.KillCount);
}
[Fact]
public async Task CloseSessionAsync_WhenWorkerShutdownFails_RemovesSessionAndReleasesSlot()
{
FakeWorkerClient failingWorkerClient = new()
{
ShutdownException = new WorkerClientException(
WorkerClientErrorCode.ShutdownTimeout,
"Worker shutdown timed out."),
};
FakeWorkerClient replacementWorkerClient = new();
SessionRegistry registry = new();
using GatewayMetrics metrics = new();
SessionManager manager = CreateManager(
new QueueingSessionWorkerClientFactory(failingWorkerClient, replacementWorkerClient),
registry,
metrics,
CreateOptions(maxSessions: 1));
GatewaySession firstSession = await manager.OpenSessionAsync(
CreateOpenRequest(),
"client-1",
CancellationToken.None);
metrics.EventReceived(firstSession.SessionId, MxEventFamily.OnDataChange.ToString());
SessionManagerException exception = await Assert.ThrowsAsync<SessionManagerException>(
async () => await manager.CloseSessionAsync(firstSession.SessionId, CancellationToken.None));
GatewaySession secondSession = await manager.OpenSessionAsync(
CreateOpenRequest(),
"client-2",
CancellationToken.None);
Assert.Equal(SessionManagerErrorCode.CloseFailed, exception.ErrorCode);
Assert.False(manager.TryGetSession(firstSession.SessionId, out _));
Assert.True(manager.TryGetSession(secondSession.SessionId, out _));
Assert.Equal(1, registry.Count);
Assert.Equal(1, failingWorkerClient.KillCount);
Assert.Equal(1, failingWorkerClient.DisposeCount);
GatewayMetricsSnapshot snapshot = metrics.GetSnapshot();
Assert.Equal(0, snapshot.SessionsClosed);
Assert.False(snapshot.EventsBySession.ContainsKey(firstSession.SessionId));
Assert.Equal(1, snapshot.OpenSessions);
}
[Fact]
public async Task OpenSessionAsync_WhenWorkerCreationFails_RemovesSessionFromRegistry()
{
@@ -254,14 +296,14 @@ public sealed class SessionManagerTests
metrics ?? new GatewayMetrics());
}
private static GatewayOptions CreateOptions()
private static GatewayOptions CreateOptions(int maxSessions = 64)
{
return new GatewayOptions
{
Sessions = new SessionOptions
{
DefaultCommandTimeoutSeconds = 30,
MaxSessions = 64,
MaxSessions = maxSessions,
},
Worker = new WorkerOptions
{
@@ -359,6 +401,8 @@ public sealed class SessionManagerTests
public int KillCount { get; private set; }
public int DisposeCount { get; private set; }
public Exception? ShutdownException { get; init; }
public WorkerCommand? LastCommand { get; private set; }
@@ -424,6 +468,7 @@ public sealed class SessionManagerTests
public ValueTask DisposeAsync()
{
DisposeCount++;
return ValueTask.CompletedTask;
}
}
@@ -343,6 +343,45 @@ public sealed class WorkerPipeSessionTests
await runTask;
}
[Fact]
public async Task RunAsync_WhenCommandThrowsAfterShutdown_DropsLateFaultAndWritesShutdownAck()
{
using CancellationTokenSource cancellation = new(TimeSpan.FromSeconds(5));
using PipePair pipePair = await PipePair.CreateAsync(cancellation.Token);
FakeRuntimeSession runtime = new()
{
BlockDispatch = true,
ThrowAfterDispatchReleased = true,
};
WorkerPipeSession session = CreatePipeSession(
pipePair.WorkerStream,
runtime,
new WorkerPipeSessionOptions
{
HeartbeatInterval = TimeSpan.FromSeconds(1),
HeartbeatGrace = TimeSpan.FromSeconds(5),
});
Task runTask = session.RunAsync(cancellation.Token);
await CompleteGatewayHandshakeAsync(pipePair, cancellation.Token);
await pipePair.GatewayWriter.WriteAsync(
CreateCommandEnvelope("command-fails-during-shutdown"),
cancellation.Token);
Assert.True(runtime.DispatchStarted.Wait(TimeSpan.FromSeconds(2)));
await pipePair.GatewayWriter
.WriteAsync(CreateShutdownEnvelope(), cancellation.Token);
WorkerEnvelope firstEnvelopeAfterShutdown = await pipePair.GatewayReader
.ReadAsync(cancellation.Token);
Assert.Equal(WorkerEnvelope.BodyOneofCase.WorkerShutdownAck, firstEnvelopeAfterShutdown.BodyCase);
Assert.Equal(ProtocolStatusCode.Ok, firstEnvelopeAfterShutdown.WorkerShutdownAck.Status.Code);
Task completedTask = await Task.WhenAny(runTask, Task.Delay(TimeSpan.FromSeconds(2), cancellation.Token));
Assert.Same(runTask, completedTask);
await runTask;
}
private static WorkerPipeSession CreateSession(
Stream inbound,
Stream outbound,
@@ -574,6 +613,8 @@ public sealed class WorkerPipeSessionTests
public bool BlockDispatch { get; set; }
public bool ThrowAfterDispatchReleased { get; set; }
public Task<WorkerReady> StartAsync(
string sessionId,
int workerProcessId,
@@ -613,6 +654,11 @@ public sealed class WorkerPipeSessionTests
lastEventSequence: 0,
currentCommandCorrelationId: string.Empty));
if (ThrowAfterDispatchReleased)
{
throw new InvalidOperationException("Command failed after shutdown started.");
}
return new MxCommandReply
{
SessionId = command.SessionId,
@@ -386,6 +386,11 @@ public sealed class WorkerPipeSession
}
catch (Exception exception) when (exception is not OperationCanceledException)
{
if (_state is not WorkerState.Ready and not WorkerState.ExecutingCommand)
{
return;
}
_state = WorkerState.Faulted;
await TryWriteFaultAsync(
CreateFault(