Improve gateway reliability and dashboard docs
This commit is contained in:
@@ -125,6 +125,16 @@ public sealed class GatewayOptionsValidator : IValidateOptions<GatewayOptions>
|
||||
"MxGateway:Sessions:DefaultCommandTimeoutSeconds must be greater than zero.",
|
||||
failures);
|
||||
AddIfNotPositive(options.MaxSessions, "MxGateway:Sessions:MaxSessions must be greater than zero.", failures);
|
||||
AddIfNotPositive(
|
||||
options.MaxPendingCommandsPerSession,
|
||||
"MxGateway:Sessions:MaxPendingCommandsPerSession must be greater than zero.",
|
||||
failures);
|
||||
|
||||
if (options.AllowMultipleEventSubscribers)
|
||||
{
|
||||
failures.Add(
|
||||
"MxGateway:Sessions:AllowMultipleEventSubscribers is not supported until event fan-out is implemented.");
|
||||
}
|
||||
}
|
||||
|
||||
private static void ValidateEvents(EventOptions options, List<string> failures)
|
||||
|
||||
@@ -6,5 +6,7 @@ public sealed class SessionOptions
|
||||
|
||||
public int MaxSessions { get; init; } = 64;
|
||||
|
||||
public int MaxPendingCommandsPerSession { get; init; } = 128;
|
||||
|
||||
public bool AllowMultipleEventSubscribers { get; init; }
|
||||
}
|
||||
|
||||
@@ -21,6 +21,11 @@ public static class DashboardDisplay
|
||||
return string.IsNullOrWhiteSpace(value) ? "-" : value;
|
||||
}
|
||||
|
||||
public static string Count(long value)
|
||||
{
|
||||
return value.ToString("N0", System.Globalization.CultureInfo.InvariantCulture);
|
||||
}
|
||||
|
||||
public static long MetricValue(DashboardSnapshot snapshot, string name, string? dimension = null)
|
||||
{
|
||||
return snapshot.Metrics.FirstOrDefault(metric =>
|
||||
|
||||
@@ -20,13 +20,13 @@ else
|
||||
|
||||
<section class="metric-grid">
|
||||
<MetricCard Label="Uptime" Value="@DashboardDisplay.Duration(Snapshot.GatewayUptime)" Detail="@Snapshot.GatewayVersion" />
|
||||
<MetricCard Label="Open Sessions" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.sessions.open").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
|
||||
<MetricCard Label="Workers Running" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.workers.running").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
|
||||
<MetricCard Label="Event Queue Depth" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.queue.depth").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
|
||||
<MetricCard Label="Commands Failed" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.commands.failed").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
|
||||
<MetricCard Label="Events Received" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
|
||||
<MetricCard Label="Faults" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.faults").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
|
||||
<MetricCard Label="Queue Overflows" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
|
||||
<MetricCard Label="Open Sessions" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.sessions.open"))" />
|
||||
<MetricCard Label="Workers Running" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.workers.running"))" />
|
||||
<MetricCard Label="Event Queue Depth" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.worker_queue.depth"))" />
|
||||
<MetricCard Label="Commands Failed" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.commands.failed"))" />
|
||||
<MetricCard Label="Events Received" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received"))" />
|
||||
<MetricCard Label="Faults" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.faults"))" />
|
||||
<MetricCard Label="Queue Overflows" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows"))" />
|
||||
</section>
|
||||
|
||||
<section class="dashboard-section">
|
||||
|
||||
@@ -18,10 +18,11 @@ else
|
||||
</div>
|
||||
|
||||
<section class="metric-grid compact">
|
||||
<MetricCard Label="Events Received" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
|
||||
<MetricCard Label="Event Queue Depth" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.queue.depth").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
|
||||
<MetricCard Label="Queue Overflows" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
|
||||
<MetricCard Label="Stream Disconnects" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.grpc.streams.disconnected").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
|
||||
<MetricCard Label="Events Received" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received"))" />
|
||||
<MetricCard Label="Worker Event Queue Depth" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.worker_queue.depth"))" />
|
||||
<MetricCard Label="Stream Queue Depth" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.grpc_stream_queue.depth"))" />
|
||||
<MetricCard Label="Queue Overflows" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows"))" />
|
||||
<MetricCard Label="Stream Disconnects" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.grpc.streams.disconnected"))" />
|
||||
</section>
|
||||
|
||||
<section class="dashboard-section">
|
||||
@@ -47,7 +48,7 @@ else
|
||||
{
|
||||
<tr>
|
||||
<td>@metric.Dimension</td>
|
||||
<td>@metric.Value</td>
|
||||
<td>@DashboardDisplay.Count(metric.Value)</td>
|
||||
</tr>
|
||||
}
|
||||
</tbody>
|
||||
|
||||
@@ -39,6 +39,7 @@ else
|
||||
<tr><th scope="row">Opened</th><td>@DashboardDisplay.DateTime(CurrentSession.OpenedAt)</td></tr>
|
||||
<tr><th scope="row">Last activity</th><td>@DashboardDisplay.DateTime(CurrentSession.LastClientActivityAt)</td></tr>
|
||||
<tr><th scope="row">Lease expires</th><td>@DashboardDisplay.DateTime(CurrentSession.LeaseExpiresAt)</td></tr>
|
||||
<tr><th scope="row">Events received</th><td>@DashboardDisplay.Count(CurrentSession.EventsReceived)</td></tr>
|
||||
<tr><th scope="row">Last fault</th><td>@DashboardDisplay.Text(CurrentSession.LastFault)</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
@@ -33,6 +33,7 @@ else
|
||||
<th scope="col">Client</th>
|
||||
<th scope="col">Backend</th>
|
||||
<th scope="col">Worker</th>
|
||||
<th scope="col">Events</th>
|
||||
<th scope="col">Opened</th>
|
||||
<th scope="col">Activity</th>
|
||||
<th scope="col">Heartbeat</th>
|
||||
@@ -54,6 +55,7 @@ else
|
||||
<span class="ms-1"><StatusBadge Text="@session.WorkerState.ToString()" /></span>
|
||||
}
|
||||
</td>
|
||||
<td>@DashboardDisplay.Count(session.EventsReceived)</td>
|
||||
<td>@DashboardDisplay.DateTime(session.OpenedAt)</td>
|
||||
<td>@DashboardDisplay.DateTime(session.LastClientActivityAt)</td>
|
||||
<td>@DashboardDisplay.DateTime(session.LastWorkerHeartbeatAt)</td>
|
||||
|
||||
@@ -16,4 +16,5 @@ public sealed record DashboardSessionSummary(
|
||||
int? WorkerProcessId,
|
||||
WorkerClientState? WorkerState,
|
||||
DateTimeOffset? LastWorkerHeartbeatAt,
|
||||
long EventsReceived,
|
||||
string? LastFault);
|
||||
|
||||
@@ -45,15 +45,15 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
|
||||
IReadOnlyList<GatewaySession> sessions = _sessionRegistry.Snapshot()
|
||||
.OrderByDescending(session => session.OpenedAt)
|
||||
.ToArray();
|
||||
GatewayMetricsSnapshot metricsSnapshot = _metrics.GetSnapshot();
|
||||
IReadOnlyList<DashboardSessionSummary> sessionSummaries = sessions
|
||||
.Take(ResolveLimit(_recentSessionLimit))
|
||||
.Select(CreateSessionSummary)
|
||||
.Select(session => CreateSessionSummary(session, metricsSnapshot))
|
||||
.ToArray();
|
||||
IReadOnlyList<DashboardWorkerSummary> workerSummaries = sessions
|
||||
.Where(session => session.WorkerClient is not null)
|
||||
.Where(session => session.WorkerClient is { State: not WorkerClientState.Closed })
|
||||
.Select(CreateWorkerSummary)
|
||||
.ToArray();
|
||||
GatewayMetricsSnapshot metricsSnapshot = _metrics.GetSnapshot();
|
||||
|
||||
return new DashboardSnapshot(
|
||||
GeneratedAt: generatedAt,
|
||||
@@ -100,9 +100,12 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
|
||||
}
|
||||
}
|
||||
|
||||
private static DashboardSessionSummary CreateSessionSummary(GatewaySession session)
|
||||
private static DashboardSessionSummary CreateSessionSummary(
|
||||
GatewaySession session,
|
||||
GatewayMetricsSnapshot metricsSnapshot)
|
||||
{
|
||||
IWorkerClient? workerClient = session.WorkerClient;
|
||||
metricsSnapshot.EventsBySession.TryGetValue(session.SessionId, out long eventsReceived);
|
||||
|
||||
return new DashboardSessionSummary(
|
||||
SessionId: session.SessionId,
|
||||
@@ -117,6 +120,7 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
|
||||
WorkerProcessId: workerClient?.ProcessId,
|
||||
WorkerState: workerClient?.State,
|
||||
LastWorkerHeartbeatAt: workerClient?.LastHeartbeatAt,
|
||||
EventsReceived: eventsReceived,
|
||||
LastFault: DashboardRedactor.Redact(session.FinalFault));
|
||||
}
|
||||
|
||||
@@ -138,7 +142,8 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
|
||||
[
|
||||
new("mxgateway.sessions.open", snapshot.OpenSessions),
|
||||
new("mxgateway.workers.running", snapshot.WorkersRunning),
|
||||
new("mxgateway.events.queue.depth", snapshot.EventQueueDepth),
|
||||
new("mxgateway.events.worker_queue.depth", snapshot.WorkerEventQueueDepth),
|
||||
new("mxgateway.events.grpc_stream_queue.depth", snapshot.GrpcEventStreamQueueDepth),
|
||||
new("mxgateway.sessions.opened", snapshot.SessionsOpened),
|
||||
new("mxgateway.sessions.closed", snapshot.SessionsClosed),
|
||||
new("mxgateway.commands.started", snapshot.CommandsStarted),
|
||||
|
||||
@@ -47,7 +47,7 @@ public sealed class EventStreamService(
|
||||
() =>
|
||||
{
|
||||
int depth = Interlocked.Increment(ref streamQueueDepth);
|
||||
metrics.SetEventQueueDepth(depth);
|
||||
metrics.SetGrpcEventStreamQueueDepth(depth);
|
||||
},
|
||||
streamCts.Token);
|
||||
|
||||
@@ -56,7 +56,7 @@ public sealed class EventStreamService(
|
||||
await foreach (MxEvent mxEvent in eventQueue.Reader.ReadAllAsync(cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
int depth = Math.Max(0, Interlocked.Decrement(ref streamQueueDepth));
|
||||
metrics.SetEventQueueDepth(depth);
|
||||
metrics.SetGrpcEventStreamQueueDepth(depth);
|
||||
yield return mxEvent;
|
||||
}
|
||||
|
||||
|
||||
@@ -26,11 +26,13 @@ public sealed class GatewayMetrics : IDisposable
|
||||
private readonly Histogram<double> _eventStreamSendLatencyHistogram;
|
||||
private readonly Dictionary<string, long> _commandFailuresByMethod = new(StringComparer.OrdinalIgnoreCase);
|
||||
private readonly Dictionary<string, long> _eventsByFamily = new(StringComparer.OrdinalIgnoreCase);
|
||||
private readonly Dictionary<string, long> _eventsBySession = new(StringComparer.Ordinal);
|
||||
private readonly Dictionary<string, long> _retryAttemptsByArea = new(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
private int _openSessions;
|
||||
private int _workersRunning;
|
||||
private int _eventQueueDepth;
|
||||
private int _workerEventQueueDepth;
|
||||
private int _grpcEventStreamQueueDepth;
|
||||
private long _sessionsOpened;
|
||||
private long _sessionsClosed;
|
||||
private long _commandsStarted;
|
||||
@@ -68,7 +70,8 @@ public sealed class GatewayMetrics : IDisposable
|
||||
|
||||
_meter.CreateObservableGauge("mxgateway.sessions.open", GetOpenSessions);
|
||||
_meter.CreateObservableGauge("mxgateway.workers.running", GetWorkersRunning);
|
||||
_meter.CreateObservableGauge("mxgateway.events.queue.depth", GetEventQueueDepth);
|
||||
_meter.CreateObservableGauge("mxgateway.events.worker_queue.depth", GetWorkerEventQueueDepth);
|
||||
_meter.CreateObservableGauge("mxgateway.events.grpc_stream_queue.depth", GetGrpcEventStreamQueueDepth);
|
||||
}
|
||||
|
||||
public void SessionOpened()
|
||||
@@ -174,11 +177,11 @@ public sealed class GatewayMetrics : IDisposable
|
||||
{
|
||||
_eventsReceived++;
|
||||
Increment(_eventsByFamily, family);
|
||||
Increment(_eventsBySession, sessionId);
|
||||
}
|
||||
|
||||
_eventsReceivedCounter.Add(
|
||||
1,
|
||||
new KeyValuePair<string, object?>("session_id", sessionId),
|
||||
new KeyValuePair<string, object?>("family", family));
|
||||
}
|
||||
|
||||
@@ -190,6 +193,11 @@ public sealed class GatewayMetrics : IDisposable
|
||||
}
|
||||
|
||||
public void SetEventQueueDepth(int depth)
|
||||
{
|
||||
SetWorkerEventQueueDepth(depth);
|
||||
}
|
||||
|
||||
public void SetWorkerEventQueueDepth(int depth)
|
||||
{
|
||||
if (depth < 0)
|
||||
{
|
||||
@@ -198,7 +206,28 @@ public sealed class GatewayMetrics : IDisposable
|
||||
|
||||
lock (_syncRoot)
|
||||
{
|
||||
_eventQueueDepth = depth;
|
||||
_workerEventQueueDepth = depth;
|
||||
}
|
||||
}
|
||||
|
||||
public void SetGrpcEventStreamQueueDepth(int depth)
|
||||
{
|
||||
if (depth < 0)
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(depth), depth, "Queue depth cannot be negative.");
|
||||
}
|
||||
|
||||
lock (_syncRoot)
|
||||
{
|
||||
_grpcEventStreamQueueDepth = depth;
|
||||
}
|
||||
}
|
||||
|
||||
public void RemoveSessionEvents(string sessionId)
|
||||
{
|
||||
lock (_syncRoot)
|
||||
{
|
||||
_eventsBySession.Remove(sessionId);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -260,7 +289,8 @@ public sealed class GatewayMetrics : IDisposable
|
||||
return new GatewayMetricsSnapshot(
|
||||
OpenSessions: _openSessions,
|
||||
WorkersRunning: _workersRunning,
|
||||
EventQueueDepth: _eventQueueDepth,
|
||||
WorkerEventQueueDepth: _workerEventQueueDepth,
|
||||
GrpcEventStreamQueueDepth: _grpcEventStreamQueueDepth,
|
||||
SessionsOpened: _sessionsOpened,
|
||||
SessionsClosed: _sessionsClosed,
|
||||
CommandsStarted: _commandsStarted,
|
||||
@@ -276,6 +306,7 @@ public sealed class GatewayMetrics : IDisposable
|
||||
RetryAttempts: _retryAttempts,
|
||||
CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase),
|
||||
EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase),
|
||||
EventsBySession: new Dictionary<string, long>(_eventsBySession, StringComparer.Ordinal),
|
||||
RetryAttemptsByArea: new Dictionary<string, long>(_retryAttemptsByArea, StringComparer.OrdinalIgnoreCase));
|
||||
}
|
||||
}
|
||||
@@ -307,11 +338,19 @@ public sealed class GatewayMetrics : IDisposable
|
||||
}
|
||||
}
|
||||
|
||||
private int GetEventQueueDepth()
|
||||
private int GetWorkerEventQueueDepth()
|
||||
{
|
||||
lock (_syncRoot)
|
||||
{
|
||||
return _eventQueueDepth;
|
||||
return _workerEventQueueDepth;
|
||||
}
|
||||
}
|
||||
|
||||
private int GetGrpcEventStreamQueueDepth()
|
||||
{
|
||||
lock (_syncRoot)
|
||||
{
|
||||
return _grpcEventStreamQueueDepth;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,7 +3,8 @@ namespace MxGateway.Server.Metrics;
|
||||
public sealed record GatewayMetricsSnapshot(
|
||||
int OpenSessions,
|
||||
int WorkersRunning,
|
||||
int EventQueueDepth,
|
||||
int WorkerEventQueueDepth,
|
||||
int GrpcEventStreamQueueDepth,
|
||||
long SessionsOpened,
|
||||
long SessionsClosed,
|
||||
long CommandsStarted,
|
||||
@@ -19,4 +20,5 @@ public sealed record GatewayMetricsSnapshot(
|
||||
long RetryAttempts,
|
||||
IReadOnlyDictionary<string, long> CommandFailuresByMethod,
|
||||
IReadOnlyDictionary<string, long> EventsByFamily,
|
||||
IReadOnlyDictionary<string, long> EventsBySession,
|
||||
IReadOnlyDictionary<string, long> RetryAttemptsByArea);
|
||||
|
||||
@@ -23,6 +23,7 @@ public sealed class SessionManager : ISessionManager
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<SessionManager> _logger;
|
||||
private readonly GatewayOptions _options;
|
||||
private readonly SemaphoreSlim _sessionSlots;
|
||||
|
||||
public SessionManager(
|
||||
ISessionRegistry registry,
|
||||
@@ -39,6 +40,7 @@ public sealed class SessionManager : ISessionManager
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
_logger = logger ?? NullLogger<SessionManager>.Instance;
|
||||
_options = options.Value;
|
||||
_sessionSlots = new SemaphoreSlim(_options.Sessions.MaxSessions, _options.Sessions.MaxSessions);
|
||||
}
|
||||
|
||||
public async Task<GatewaySession> OpenSessionAsync(
|
||||
@@ -49,16 +51,17 @@ public sealed class SessionManager : ISessionManager
|
||||
ArgumentNullException.ThrowIfNull(request);
|
||||
EnsureSessionCapacity();
|
||||
|
||||
GatewaySession session = CreateSession(request, clientIdentity);
|
||||
if (!_registry.TryAdd(session))
|
||||
{
|
||||
throw new SessionManagerException(
|
||||
SessionManagerErrorCode.OpenFailed,
|
||||
$"Session id collision while opening session {session.SessionId}.");
|
||||
}
|
||||
|
||||
GatewaySession? session = null;
|
||||
try
|
||||
{
|
||||
session = CreateSession(request, clientIdentity);
|
||||
if (!_registry.TryAdd(session))
|
||||
{
|
||||
throw new SessionManagerException(
|
||||
SessionManagerErrorCode.OpenFailed,
|
||||
$"Session id collision while opening session {session.SessionId}.");
|
||||
}
|
||||
|
||||
session.TransitionTo(SessionState.StartingWorker);
|
||||
IWorkerClient workerClient = await _workerClientFactory
|
||||
.CreateAsync(session, cancellationToken)
|
||||
@@ -72,18 +75,23 @@ public sealed class SessionManager : ISessionManager
|
||||
}
|
||||
catch (Exception exception)
|
||||
{
|
||||
session.MarkFaulted(exception.Message);
|
||||
_registry.TryRemove(session.SessionId, out _);
|
||||
await session.DisposeAsync().ConfigureAwait(false);
|
||||
session?.MarkFaulted(exception.Message);
|
||||
if (session is not null)
|
||||
{
|
||||
_registry.TryRemove(session.SessionId, out _);
|
||||
await session.DisposeAsync().ConfigureAwait(false);
|
||||
}
|
||||
|
||||
ReleaseSessionSlot();
|
||||
_metrics.Fault(SessionManagerErrorCode.OpenFailed.ToString());
|
||||
_logger.LogWarning(
|
||||
exception,
|
||||
"Failed to open gateway session {SessionId}.",
|
||||
session.SessionId);
|
||||
session?.SessionId ?? "<not-created>");
|
||||
|
||||
throw new SessionManagerException(
|
||||
SessionManagerErrorCode.OpenFailed,
|
||||
$"Failed to open session {session.SessionId}.",
|
||||
session is null ? "Failed to create session." : $"Failed to open session {session.SessionId}.",
|
||||
exception);
|
||||
}
|
||||
}
|
||||
@@ -177,6 +185,7 @@ public sealed class SessionManager : ISessionManager
|
||||
"Graceful shutdown failed for session {SessionId}; killing worker.",
|
||||
session.SessionId);
|
||||
session.KillWorker(GatewayShutdownReason);
|
||||
await RemoveSessionAsync(session).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -195,6 +204,7 @@ public sealed class SessionManager : ISessionManager
|
||||
_metrics.SessionClosed();
|
||||
}
|
||||
|
||||
await RemoveSessionAsync(session).ConfigureAwait(false);
|
||||
return result;
|
||||
}
|
||||
catch (Exception exception)
|
||||
@@ -222,7 +232,7 @@ public sealed class SessionManager : ISessionManager
|
||||
|
||||
private void EnsureSessionCapacity()
|
||||
{
|
||||
if (_registry.ActiveCount >= _options.Sessions.MaxSessions)
|
||||
if (!_sessionSlots.Wait(0))
|
||||
{
|
||||
throw new SessionManagerException(
|
||||
SessionManagerErrorCode.SessionLimitExceeded,
|
||||
@@ -230,6 +240,29 @@ public sealed class SessionManager : ISessionManager
|
||||
}
|
||||
}
|
||||
|
||||
private async Task RemoveSessionAsync(GatewaySession session)
|
||||
{
|
||||
if (!_registry.TryRemove(session.SessionId, out GatewaySession? removedSession))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
_metrics.RemoveSessionEvents(session.SessionId);
|
||||
ReleaseSessionSlot();
|
||||
await removedSession.DisposeAsync().ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private void ReleaseSessionSlot()
|
||||
{
|
||||
try
|
||||
{
|
||||
_sessionSlots.Release();
|
||||
}
|
||||
catch (SemaphoreFullException)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
private GatewaySession CreateSession(
|
||||
SessionOpenRequest request,
|
||||
string? clientIdentity)
|
||||
@@ -244,6 +277,7 @@ public sealed class SessionManager : ISessionManager
|
||||
string pipeName = $"mxaccess-gateway-{Environment.ProcessId}-{sessionId}";
|
||||
string nonce = CreateNonce();
|
||||
DateTimeOffset openedAt = _timeProvider.GetUtcNow();
|
||||
string clientCorrelationId = CreateClientCorrelationId(request.ClientSessionName, sessionId);
|
||||
|
||||
return new GatewaySession(
|
||||
sessionId,
|
||||
@@ -252,13 +286,24 @@ public sealed class SessionManager : ISessionManager
|
||||
nonce,
|
||||
clientIdentity,
|
||||
request.ClientSessionName,
|
||||
request.ClientCorrelationId,
|
||||
clientCorrelationId,
|
||||
commandTimeout,
|
||||
startupTimeout,
|
||||
shutdownTimeout,
|
||||
openedAt);
|
||||
}
|
||||
|
||||
private static string CreateClientCorrelationId(
|
||||
string? clientSessionName,
|
||||
string sessionId)
|
||||
{
|
||||
string clientName = string.IsNullOrWhiteSpace(clientSessionName)
|
||||
? "client"
|
||||
: clientSessionName!;
|
||||
|
||||
return $"{clientName}-{sessionId}";
|
||||
}
|
||||
|
||||
private TimeSpan ResolveCommandTimeout(Duration? requestedTimeout)
|
||||
{
|
||||
if (requestedTimeout is null)
|
||||
|
||||
@@ -7,6 +7,7 @@ public static class SessionServiceCollectionExtensions
|
||||
services.AddSingleton<ISessionRegistry, SessionRegistry>();
|
||||
services.AddSingleton<ISessionWorkerClientFactory, SessionWorkerClientFactory>();
|
||||
services.AddSingleton<ISessionManager, SessionManager>();
|
||||
services.AddHostedService<SessionShutdownHostedService>();
|
||||
|
||||
return services;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace MxGateway.Server.Sessions;
|
||||
|
||||
public sealed class SessionShutdownHostedService(
|
||||
ISessionManager sessionManager,
|
||||
ILogger<SessionShutdownHostedService> logger) : IHostedService
|
||||
{
|
||||
public Task StartAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
public async Task StopAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
await sessionManager.ShutdownAsync(cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
logger.LogWarning("Gateway session shutdown was canceled by host shutdown timeout.");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -74,6 +74,7 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
|
||||
HeartbeatGrace = TimeSpan.FromSeconds(_options.Worker.HeartbeatGraceSeconds),
|
||||
HeartbeatCheckInterval = TimeSpan.FromSeconds(_options.Worker.HeartbeatIntervalSeconds),
|
||||
EventChannelCapacity = _options.Events.QueueCapacity,
|
||||
MaxPendingCommands = _options.Sessions.MaxPendingCommandsPerSession,
|
||||
};
|
||||
|
||||
workerClient = new WorkerClient(
|
||||
|
||||
@@ -24,6 +24,7 @@ public sealed class WorkerClient : IWorkerClient
|
||||
private readonly Channel<WorkerEnvelope> _outboundEnvelopes;
|
||||
private readonly Channel<WorkerEvent> _events;
|
||||
private readonly ConcurrentDictionary<string, PendingCommand> _pendingCommands = new(StringComparer.Ordinal);
|
||||
private readonly SemaphoreSlim _pendingCommandSlots;
|
||||
private readonly CancellationTokenSource _stopCts = new();
|
||||
private long _nextSequence;
|
||||
private WorkerClientState _state;
|
||||
@@ -33,6 +34,7 @@ public sealed class WorkerClient : IWorkerClient
|
||||
private Task? _readLoopTask;
|
||||
private Task? _writeLoopTask;
|
||||
private Task? _heartbeatLoopTask;
|
||||
private bool _workerStartRecorded;
|
||||
private bool _disposed;
|
||||
|
||||
public WorkerClient(
|
||||
@@ -49,11 +51,13 @@ public sealed class WorkerClient : IWorkerClient
|
||||
_logger = logger ?? NullLogger<WorkerClient>.Instance;
|
||||
_reader = new WorkerFrameReader(connection.Stream, connection.FrameOptions);
|
||||
_writer = new WorkerFrameWriter(connection.Stream, connection.FrameOptions);
|
||||
_outboundEnvelopes = Channel.CreateUnbounded<WorkerEnvelope>(
|
||||
new UnboundedChannelOptions
|
||||
_pendingCommandSlots = new SemaphoreSlim(_options.MaxPendingCommands, _options.MaxPendingCommands);
|
||||
_outboundEnvelopes = Channel.CreateBounded<WorkerEnvelope>(
|
||||
new BoundedChannelOptions(_options.MaxPendingCommands + 4)
|
||||
{
|
||||
SingleReader = true,
|
||||
SingleWriter = false,
|
||||
FullMode = BoundedChannelFullMode.Wait,
|
||||
AllowSynchronousContinuations = false,
|
||||
});
|
||||
_events = Channel.CreateBounded<WorkerEvent>(
|
||||
@@ -140,6 +144,14 @@ public sealed class WorkerClient : IWorkerClient
|
||||
|
||||
string correlationId = Guid.NewGuid().ToString("N");
|
||||
string method = GetCommandMethod(command);
|
||||
if (!_pendingCommandSlots.Wait(0))
|
||||
{
|
||||
_metrics?.QueueOverflow("worker-pending-commands");
|
||||
throw new WorkerClientException(
|
||||
WorkerClientErrorCode.PendingCommandLimitExceeded,
|
||||
$"Worker session {SessionId} already has {_options.MaxPendingCommands} pending command(s).");
|
||||
}
|
||||
|
||||
PendingCommand pendingCommand = new(
|
||||
correlationId,
|
||||
method,
|
||||
@@ -147,6 +159,7 @@ public sealed class WorkerClient : IWorkerClient
|
||||
|
||||
if (!_pendingCommands.TryAdd(correlationId, pendingCommand))
|
||||
{
|
||||
ReleasePendingCommandSlot();
|
||||
throw new InvalidOperationException("Generated a duplicate command correlation id.");
|
||||
}
|
||||
|
||||
@@ -188,7 +201,11 @@ public sealed class WorkerClient : IWorkerClient
|
||||
}
|
||||
catch
|
||||
{
|
||||
_pendingCommands.TryRemove(correlationId, out _);
|
||||
if (_pendingCommands.TryRemove(correlationId, out _))
|
||||
{
|
||||
ReleasePendingCommandSlot();
|
||||
}
|
||||
|
||||
throw;
|
||||
}
|
||||
}
|
||||
@@ -199,7 +216,7 @@ public sealed class WorkerClient : IWorkerClient
|
||||
await foreach (WorkerEvent workerEvent in _events.Reader.ReadAllAsync(cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
int queueDepth = Math.Max(0, Interlocked.Decrement(ref _eventQueueDepth));
|
||||
_metrics?.SetEventQueueDepth(queueDepth);
|
||||
_metrics?.SetWorkerEventQueueDepth(queueDepth);
|
||||
yield return workerEvent;
|
||||
}
|
||||
}
|
||||
@@ -272,6 +289,7 @@ public sealed class WorkerClient : IWorkerClient
|
||||
await WaitForBackgroundTasksAsync(CancellationToken.None).ConfigureAwait(false);
|
||||
await _connection.Stream.DisposeAsync().ConfigureAwait(false);
|
||||
_connection.ProcessHandle?.Dispose();
|
||||
_pendingCommandSlots.Dispose();
|
||||
_stopCts.Dispose();
|
||||
}
|
||||
|
||||
@@ -409,7 +427,7 @@ public sealed class WorkerClient : IWorkerClient
|
||||
}
|
||||
|
||||
int queueDepth = Interlocked.Increment(ref _eventQueueDepth);
|
||||
_metrics?.SetEventQueueDepth(queueDepth);
|
||||
_metrics?.SetWorkerEventQueueDepth(queueDepth);
|
||||
}
|
||||
|
||||
private void CompleteCommand(WorkerEnvelope envelope)
|
||||
@@ -429,6 +447,7 @@ public sealed class WorkerClient : IWorkerClient
|
||||
return;
|
||||
}
|
||||
|
||||
ReleasePendingCommandSlot();
|
||||
TimeSpan duration = _timeProvider.GetElapsedTime(pendingCommand.StartTimestamp);
|
||||
_metrics?.CommandSucceeded(pendingCommand.Method, duration);
|
||||
pendingCommand.SetResult(envelope.WorkerCommandReply);
|
||||
@@ -445,6 +464,7 @@ public sealed class WorkerClient : IWorkerClient
|
||||
return;
|
||||
}
|
||||
|
||||
ReleasePendingCommandSlot();
|
||||
TimeSpan duration = _timeProvider.GetElapsedTime(pendingCommand.StartTimestamp);
|
||||
_metrics?.CommandFailed(pendingCommand.Method, errorCode.ToString(), duration);
|
||||
pendingCommand.SetException(new WorkerClientException(errorCode, message));
|
||||
@@ -498,6 +518,7 @@ public sealed class WorkerClient : IWorkerClient
|
||||
: ready.WorkerProcessId;
|
||||
_lastHeartbeatAt = _timeProvider.GetUtcNow();
|
||||
_state = WorkerClientState.Ready;
|
||||
_workerStartRecorded = true;
|
||||
}
|
||||
|
||||
DateTimeOffset readyAt = _timeProvider.GetUtcNow();
|
||||
@@ -549,7 +570,7 @@ public sealed class WorkerClient : IWorkerClient
|
||||
new WorkerClientException(
|
||||
WorkerClientErrorCode.GatewayShutdown,
|
||||
$"Worker client closed because {reason}."));
|
||||
_metrics?.WorkerStopped(reason);
|
||||
RecordWorkerStoppedOnce(reason);
|
||||
}
|
||||
|
||||
private void SetFaulted(
|
||||
@@ -575,16 +596,33 @@ public sealed class WorkerClient : IWorkerClient
|
||||
_outboundEnvelopes.Writer.TryComplete(fault);
|
||||
_events.Writer.TryComplete(fault);
|
||||
CompletePendingCommands(fault);
|
||||
RecordWorkerStoppedOnce(errorCode.ToString());
|
||||
_metrics?.Fault(errorCode.ToString());
|
||||
_logger.LogWarning(exception, "Worker client faulted for session {SessionId}: {Message}", SessionId, message);
|
||||
}
|
||||
|
||||
private void RecordWorkerStoppedOnce(string reason)
|
||||
{
|
||||
bool shouldRecord;
|
||||
lock (_syncRoot)
|
||||
{
|
||||
shouldRecord = _workerStartRecorded;
|
||||
_workerStartRecorded = false;
|
||||
}
|
||||
|
||||
if (shouldRecord)
|
||||
{
|
||||
_metrics?.WorkerStopped(reason);
|
||||
}
|
||||
}
|
||||
|
||||
private void CompletePendingCommands(Exception exception)
|
||||
{
|
||||
foreach (KeyValuePair<string, PendingCommand> item in _pendingCommands.ToArray())
|
||||
{
|
||||
if (_pendingCommands.TryRemove(item.Key, out PendingCommand? pendingCommand))
|
||||
{
|
||||
ReleasePendingCommandSlot();
|
||||
TimeSpan duration = _timeProvider.GetElapsedTime(pendingCommand.StartTimestamp);
|
||||
_metrics?.CommandFailed(pendingCommand.Method, exception.GetType().Name, duration);
|
||||
pendingCommand.SetException(exception);
|
||||
@@ -592,6 +630,17 @@ public sealed class WorkerClient : IWorkerClient
|
||||
}
|
||||
}
|
||||
|
||||
private void ReleasePendingCommandSlot()
|
||||
{
|
||||
try
|
||||
{
|
||||
_pendingCommandSlots.Release();
|
||||
}
|
||||
catch (SemaphoreFullException)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
private void TransitionFromCreatedToHandshaking()
|
||||
{
|
||||
lock (_syncRoot)
|
||||
|
||||
@@ -11,4 +11,5 @@ public enum WorkerClientErrorCode
|
||||
ShutdownTimeout,
|
||||
GatewayShutdown,
|
||||
WriteFailed,
|
||||
PendingCommandLimitExceeded,
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@ public sealed class WorkerClientOptions
|
||||
HeartbeatCheckInterval = DefaultHeartbeatCheckInterval;
|
||||
EventChannelCapacity = 1_024;
|
||||
EventChannelFullModeTimeout = DefaultEventChannelFullModeTimeout;
|
||||
MaxPendingCommands = 128;
|
||||
}
|
||||
|
||||
public TimeSpan HeartbeatGrace { get; init; }
|
||||
@@ -21,4 +22,6 @@ public sealed class WorkerClientOptions
|
||||
public int EventChannelCapacity { get; init; }
|
||||
|
||||
public TimeSpan EventChannelFullModeTimeout { get; init; }
|
||||
|
||||
public int MaxPendingCommands { get; init; }
|
||||
}
|
||||
|
||||
@@ -96,8 +96,6 @@ public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
|
||||
startupTimeout.Token)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
_metrics.WorkerStarted(_timeProvider.GetUtcNow() - startedAt);
|
||||
|
||||
return new WorkerProcessHandle(process, commandLine, startedAt);
|
||||
}
|
||||
catch (OperationCanceledException exception) when (!cancellationToken.IsCancellationRequested)
|
||||
|
||||
Reference in New Issue
Block a user