Improve gateway reliability and client e2e coverage

This commit is contained in:
Joseph Doherty
2026-04-28 06:11:18 -04:00
parent 4fc355b357
commit 907aa49aea
25 changed files with 1153 additions and 83 deletions
@@ -2,5 +2,7 @@ namespace MxGateway.Server.Configuration;
public enum EventBackpressurePolicy
{
FailFast
FailFast,
DisconnectSubscriber
}
@@ -108,9 +108,19 @@ public sealed class EventStreamService(
if (!writer.TryWrite(publicEvent))
{
string message = $"Session {session.SessionId} event stream queue overflowed.";
session.MarkFaulted(message);
metrics.QueueOverflow("grpc-event-stream");
metrics.Fault(SessionManagerErrorCode.EventQueueOverflow.ToString());
if (options.Value.Events.BackpressurePolicy == EventBackpressurePolicy.FailFast)
{
session.MarkFaulted(message);
metrics.Fault(SessionManagerErrorCode.EventQueueOverflow.ToString());
}
else
{
logger.LogDebug(
"Disconnecting event stream for session {SessionId} after queue overflow.",
session.SessionId);
}
writer.TryComplete(new SessionManagerException(
SessionManagerErrorCode.EventQueueOverflow,
message));
+13 -13
View File
@@ -1,3 +1,4 @@
using System.Collections.Concurrent;
using System.Diagnostics.Metrics;
namespace MxGateway.Server.Metrics;
@@ -25,8 +26,8 @@ public sealed class GatewayMetrics : IDisposable
private readonly Histogram<double> _commandLatencyHistogram;
private readonly Histogram<double> _eventStreamSendLatencyHistogram;
private readonly Dictionary<string, long> _commandFailuresByMethod = new(StringComparer.OrdinalIgnoreCase);
private readonly Dictionary<string, long> _eventsByFamily = new(StringComparer.OrdinalIgnoreCase);
private readonly Dictionary<string, long> _eventsBySession = new(StringComparer.Ordinal);
private readonly ConcurrentDictionary<string, long> _eventsByFamily = new(StringComparer.OrdinalIgnoreCase);
private readonly ConcurrentDictionary<string, long> _eventsBySession = new(StringComparer.Ordinal);
private readonly Dictionary<string, long> _retryAttemptsByArea = new(StringComparer.OrdinalIgnoreCase);
private int _openSessions;
@@ -173,12 +174,9 @@ public sealed class GatewayMetrics : IDisposable
public void EventReceived(string sessionId, string family)
{
lock (_syncRoot)
{
_eventsReceived++;
Increment(_eventsByFamily, family);
Increment(_eventsBySession, sessionId);
}
Interlocked.Increment(ref _eventsReceived);
Increment(_eventsByFamily, family);
Increment(_eventsBySession, sessionId);
_eventsReceivedCounter.Add(
1,
@@ -225,10 +223,7 @@ public sealed class GatewayMetrics : IDisposable
public void RemoveSessionEvents(string sessionId)
{
lock (_syncRoot)
{
_eventsBySession.Remove(sessionId);
}
_eventsBySession.TryRemove(sessionId, out _);
}
public void QueueOverflow(string queueName)
@@ -296,7 +291,7 @@ public sealed class GatewayMetrics : IDisposable
CommandsStarted: _commandsStarted,
CommandsSucceeded: _commandsSucceeded,
CommandsFailed: _commandsFailed,
EventsReceived: _eventsReceived,
EventsReceived: Interlocked.Read(ref _eventsReceived),
QueueOverflows: _queueOverflows,
Faults: _faults,
WorkerKills: _workerKills,
@@ -359,4 +354,9 @@ public sealed class GatewayMetrics : IDisposable
values.TryGetValue(key, out long currentValue);
values[key] = currentValue + 1;
}
private static void Increment(ConcurrentDictionary<string, long> values, string key)
{
values.AddOrUpdate(key, 1, static (_, currentValue) => currentValue + 1);
}
}
@@ -41,6 +41,9 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
NamedPipeServerStream? pipe = CreatePipe(session.PipeName);
WorkerProcessHandle? processHandle = null;
IWorkerClient? workerClient = null;
using CancellationTokenSource startupCancellation =
CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
startupCancellation.CancelAfter(session.StartupTimeout);
try
{
session.TransitionTo(SessionState.StartingWorker);
@@ -52,11 +55,11 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
GatewayContractInfo.WorkerProtocolVersion,
session.Nonce,
pipe),
cancellationToken)
startupCancellation.Token)
.ConfigureAwait(false);
session.TransitionTo(SessionState.WaitingForPipe);
await WaitForPipeConnectionAsync(pipe, session.StartupTimeout, cancellationToken).ConfigureAwait(false);
await WaitForPipeConnectionAsync(pipe, startupCancellation.Token).ConfigureAwait(false);
session.TransitionTo(SessionState.Handshaking);
WorkerFrameProtocolOptions frameOptions = new(
@@ -88,14 +91,23 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
processHandle = null;
session.TransitionTo(SessionState.InitializingWorker);
await workerClient.StartAsync(cancellationToken).ConfigureAwait(false);
await workerClient.StartAsync(startupCancellation.Token).ConfigureAwait(false);
return workerClient;
}
catch
catch (Exception exception)
{
if (workerClient is not null)
{
try
{
workerClient.Kill("OpenSessionFailed");
}
catch
{
// Preserve the startup failure while still disposing below.
}
await workerClient.DisposeAsync().ConfigureAwait(false);
}
else
@@ -119,6 +131,15 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
pipe?.Dispose();
}
if (exception is OperationCanceledException
&& startupCancellation.IsCancellationRequested
&& !cancellationToken.IsCancellationRequested)
{
throw new TimeoutException(
$"Worker session {session.SessionId} did not complete startup within {session.StartupTimeout}.",
exception);
}
throw;
}
}
@@ -135,11 +156,8 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
private static async Task WaitForPipeConnectionAsync(
NamedPipeServerStream pipe,
TimeSpan startupTimeout,
CancellationToken cancellationToken)
{
using CancellationTokenSource timeout = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
timeout.CancelAfter(startupTimeout);
await pipe.WaitForConnectionAsync(timeout.Token).ConfigureAwait(false);
await pipe.WaitForConnectionAsync(cancellationToken).ConfigureAwait(false);
}
}
+13 -1
View File
@@ -13,6 +13,7 @@ namespace MxGateway.Server.Workers;
public sealed class WorkerClient : IWorkerClient
{
private const string GatewayVersionFallback = "unknown";
private static readonly TimeSpan DisposeTaskTimeout = TimeSpan.FromSeconds(5);
private readonly object _syncRoot = new();
private readonly WorkerClientConnection _connection;
private readonly WorkerClientOptions _options;
@@ -286,8 +287,19 @@ public sealed class WorkerClient : IWorkerClient
WorkerClientErrorCode.GatewayShutdown,
"Worker client was disposed."));
await WaitForBackgroundTasksAsync(CancellationToken.None).ConfigureAwait(false);
await _connection.Stream.DisposeAsync().ConfigureAwait(false);
using CancellationTokenSource disposeTimeout = new(DisposeTaskTimeout);
try
{
await WaitForBackgroundTasksAsync(disposeTimeout.Token).ConfigureAwait(false);
}
catch (OperationCanceledException)
{
_logger.LogWarning(
"Timed out waiting for worker client background tasks to stop for session {SessionId}.",
SessionId);
}
_connection.ProcessHandle?.Dispose();
_pendingCommandSlots.Dispose();
_stopCts.Dispose();