Improve gateway reliability and client e2e coverage
This commit is contained in:
@@ -2,5 +2,7 @@ namespace MxGateway.Server.Configuration;
|
||||
|
||||
public enum EventBackpressurePolicy
|
||||
{
|
||||
FailFast
|
||||
FailFast,
|
||||
|
||||
DisconnectSubscriber
|
||||
}
|
||||
|
||||
@@ -108,9 +108,19 @@ public sealed class EventStreamService(
|
||||
if (!writer.TryWrite(publicEvent))
|
||||
{
|
||||
string message = $"Session {session.SessionId} event stream queue overflowed.";
|
||||
session.MarkFaulted(message);
|
||||
metrics.QueueOverflow("grpc-event-stream");
|
||||
metrics.Fault(SessionManagerErrorCode.EventQueueOverflow.ToString());
|
||||
if (options.Value.Events.BackpressurePolicy == EventBackpressurePolicy.FailFast)
|
||||
{
|
||||
session.MarkFaulted(message);
|
||||
metrics.Fault(SessionManagerErrorCode.EventQueueOverflow.ToString());
|
||||
}
|
||||
else
|
||||
{
|
||||
logger.LogDebug(
|
||||
"Disconnecting event stream for session {SessionId} after queue overflow.",
|
||||
session.SessionId);
|
||||
}
|
||||
|
||||
writer.TryComplete(new SessionManagerException(
|
||||
SessionManagerErrorCode.EventQueueOverflow,
|
||||
message));
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace MxGateway.Server.Metrics;
|
||||
@@ -25,8 +26,8 @@ public sealed class GatewayMetrics : IDisposable
|
||||
private readonly Histogram<double> _commandLatencyHistogram;
|
||||
private readonly Histogram<double> _eventStreamSendLatencyHistogram;
|
||||
private readonly Dictionary<string, long> _commandFailuresByMethod = new(StringComparer.OrdinalIgnoreCase);
|
||||
private readonly Dictionary<string, long> _eventsByFamily = new(StringComparer.OrdinalIgnoreCase);
|
||||
private readonly Dictionary<string, long> _eventsBySession = new(StringComparer.Ordinal);
|
||||
private readonly ConcurrentDictionary<string, long> _eventsByFamily = new(StringComparer.OrdinalIgnoreCase);
|
||||
private readonly ConcurrentDictionary<string, long> _eventsBySession = new(StringComparer.Ordinal);
|
||||
private readonly Dictionary<string, long> _retryAttemptsByArea = new(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
private int _openSessions;
|
||||
@@ -173,12 +174,9 @@ public sealed class GatewayMetrics : IDisposable
|
||||
|
||||
public void EventReceived(string sessionId, string family)
|
||||
{
|
||||
lock (_syncRoot)
|
||||
{
|
||||
_eventsReceived++;
|
||||
Increment(_eventsByFamily, family);
|
||||
Increment(_eventsBySession, sessionId);
|
||||
}
|
||||
Interlocked.Increment(ref _eventsReceived);
|
||||
Increment(_eventsByFamily, family);
|
||||
Increment(_eventsBySession, sessionId);
|
||||
|
||||
_eventsReceivedCounter.Add(
|
||||
1,
|
||||
@@ -225,10 +223,7 @@ public sealed class GatewayMetrics : IDisposable
|
||||
|
||||
public void RemoveSessionEvents(string sessionId)
|
||||
{
|
||||
lock (_syncRoot)
|
||||
{
|
||||
_eventsBySession.Remove(sessionId);
|
||||
}
|
||||
_eventsBySession.TryRemove(sessionId, out _);
|
||||
}
|
||||
|
||||
public void QueueOverflow(string queueName)
|
||||
@@ -296,7 +291,7 @@ public sealed class GatewayMetrics : IDisposable
|
||||
CommandsStarted: _commandsStarted,
|
||||
CommandsSucceeded: _commandsSucceeded,
|
||||
CommandsFailed: _commandsFailed,
|
||||
EventsReceived: _eventsReceived,
|
||||
EventsReceived: Interlocked.Read(ref _eventsReceived),
|
||||
QueueOverflows: _queueOverflows,
|
||||
Faults: _faults,
|
||||
WorkerKills: _workerKills,
|
||||
@@ -359,4 +354,9 @@ public sealed class GatewayMetrics : IDisposable
|
||||
values.TryGetValue(key, out long currentValue);
|
||||
values[key] = currentValue + 1;
|
||||
}
|
||||
|
||||
private static void Increment(ConcurrentDictionary<string, long> values, string key)
|
||||
{
|
||||
values.AddOrUpdate(key, 1, static (_, currentValue) => currentValue + 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,6 +41,9 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
|
||||
NamedPipeServerStream? pipe = CreatePipe(session.PipeName);
|
||||
WorkerProcessHandle? processHandle = null;
|
||||
IWorkerClient? workerClient = null;
|
||||
using CancellationTokenSource startupCancellation =
|
||||
CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
|
||||
startupCancellation.CancelAfter(session.StartupTimeout);
|
||||
try
|
||||
{
|
||||
session.TransitionTo(SessionState.StartingWorker);
|
||||
@@ -52,11 +55,11 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
|
||||
GatewayContractInfo.WorkerProtocolVersion,
|
||||
session.Nonce,
|
||||
pipe),
|
||||
cancellationToken)
|
||||
startupCancellation.Token)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
session.TransitionTo(SessionState.WaitingForPipe);
|
||||
await WaitForPipeConnectionAsync(pipe, session.StartupTimeout, cancellationToken).ConfigureAwait(false);
|
||||
await WaitForPipeConnectionAsync(pipe, startupCancellation.Token).ConfigureAwait(false);
|
||||
|
||||
session.TransitionTo(SessionState.Handshaking);
|
||||
WorkerFrameProtocolOptions frameOptions = new(
|
||||
@@ -88,14 +91,23 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
|
||||
processHandle = null;
|
||||
|
||||
session.TransitionTo(SessionState.InitializingWorker);
|
||||
await workerClient.StartAsync(cancellationToken).ConfigureAwait(false);
|
||||
await workerClient.StartAsync(startupCancellation.Token).ConfigureAwait(false);
|
||||
|
||||
return workerClient;
|
||||
}
|
||||
catch
|
||||
catch (Exception exception)
|
||||
{
|
||||
if (workerClient is not null)
|
||||
{
|
||||
try
|
||||
{
|
||||
workerClient.Kill("OpenSessionFailed");
|
||||
}
|
||||
catch
|
||||
{
|
||||
// Preserve the startup failure while still disposing below.
|
||||
}
|
||||
|
||||
await workerClient.DisposeAsync().ConfigureAwait(false);
|
||||
}
|
||||
else
|
||||
@@ -119,6 +131,15 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
|
||||
pipe?.Dispose();
|
||||
}
|
||||
|
||||
if (exception is OperationCanceledException
|
||||
&& startupCancellation.IsCancellationRequested
|
||||
&& !cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
throw new TimeoutException(
|
||||
$"Worker session {session.SessionId} did not complete startup within {session.StartupTimeout}.",
|
||||
exception);
|
||||
}
|
||||
|
||||
throw;
|
||||
}
|
||||
}
|
||||
@@ -135,11 +156,8 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
|
||||
|
||||
private static async Task WaitForPipeConnectionAsync(
|
||||
NamedPipeServerStream pipe,
|
||||
TimeSpan startupTimeout,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
using CancellationTokenSource timeout = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
|
||||
timeout.CancelAfter(startupTimeout);
|
||||
await pipe.WaitForConnectionAsync(timeout.Token).ConfigureAwait(false);
|
||||
await pipe.WaitForConnectionAsync(cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@ namespace MxGateway.Server.Workers;
|
||||
public sealed class WorkerClient : IWorkerClient
|
||||
{
|
||||
private const string GatewayVersionFallback = "unknown";
|
||||
private static readonly TimeSpan DisposeTaskTimeout = TimeSpan.FromSeconds(5);
|
||||
private readonly object _syncRoot = new();
|
||||
private readonly WorkerClientConnection _connection;
|
||||
private readonly WorkerClientOptions _options;
|
||||
@@ -286,8 +287,19 @@ public sealed class WorkerClient : IWorkerClient
|
||||
WorkerClientErrorCode.GatewayShutdown,
|
||||
"Worker client was disposed."));
|
||||
|
||||
await WaitForBackgroundTasksAsync(CancellationToken.None).ConfigureAwait(false);
|
||||
await _connection.Stream.DisposeAsync().ConfigureAwait(false);
|
||||
using CancellationTokenSource disposeTimeout = new(DisposeTaskTimeout);
|
||||
try
|
||||
{
|
||||
await WaitForBackgroundTasksAsync(disposeTimeout.Token).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Timed out waiting for worker client background tasks to stop for session {SessionId}.",
|
||||
SessionId);
|
||||
}
|
||||
|
||||
_connection.ProcessHandle?.Dispose();
|
||||
_pendingCommandSlots.Dispose();
|
||||
_stopCts.Dispose();
|
||||
|
||||
Reference in New Issue
Block a user