Improve gateway reliability and dashboard docs

This commit is contained in:
Joseph Doherty
2026-04-28 00:13:22 -04:00
parent bd4a09a35e
commit 4fc355b357
61 changed files with 1722 additions and 150 deletions
@@ -125,6 +125,16 @@ public sealed class GatewayOptionsValidator : IValidateOptions<GatewayOptions>
"MxGateway:Sessions:DefaultCommandTimeoutSeconds must be greater than zero.",
failures);
AddIfNotPositive(options.MaxSessions, "MxGateway:Sessions:MaxSessions must be greater than zero.", failures);
AddIfNotPositive(
options.MaxPendingCommandsPerSession,
"MxGateway:Sessions:MaxPendingCommandsPerSession must be greater than zero.",
failures);
if (options.AllowMultipleEventSubscribers)
{
failures.Add(
"MxGateway:Sessions:AllowMultipleEventSubscribers is not supported until event fan-out is implemented.");
}
}
private static void ValidateEvents(EventOptions options, List<string> failures)
@@ -6,5 +6,7 @@ public sealed class SessionOptions
public int MaxSessions { get; init; } = 64;
public int MaxPendingCommandsPerSession { get; init; } = 128;
public bool AllowMultipleEventSubscribers { get; init; }
}
@@ -21,6 +21,11 @@ public static class DashboardDisplay
return string.IsNullOrWhiteSpace(value) ? "-" : value;
}
/// <summary>
/// Formats a count for display using the invariant culture's "N0" pattern
/// (thousands separators, no decimal places).
/// </summary>
/// <param name="value">The count to format.</param>
/// <returns>The formatted count, for example <c>1,234</c>.</returns>
public static string Count(long value) =>
    value.ToString("N0", System.Globalization.NumberFormatInfo.InvariantInfo);
public static long MetricValue(DashboardSnapshot snapshot, string name, string? dimension = null)
{
return snapshot.Metrics.FirstOrDefault(metric =>
@@ -20,13 +20,13 @@ else
<section class="metric-grid">
<MetricCard Label="Uptime" Value="@DashboardDisplay.Duration(Snapshot.GatewayUptime)" Detail="@Snapshot.GatewayVersion" />
<MetricCard Label="Open Sessions" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.sessions.open").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Workers Running" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.workers.running").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Event Queue Depth" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.queue.depth").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Commands Failed" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.commands.failed").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Events Received" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Faults" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.faults").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Queue Overflows" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Open Sessions" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.sessions.open"))" />
<MetricCard Label="Workers Running" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.workers.running"))" />
<MetricCard Label="Event Queue Depth" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.worker_queue.depth"))" />
<MetricCard Label="Commands Failed" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.commands.failed"))" />
<MetricCard Label="Events Received" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received"))" />
<MetricCard Label="Faults" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.faults"))" />
<MetricCard Label="Queue Overflows" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows"))" />
</section>
<section class="dashboard-section">
@@ -18,10 +18,11 @@ else
</div>
<section class="metric-grid compact">
<MetricCard Label="Events Received" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Event Queue Depth" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.queue.depth").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Queue Overflows" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Stream Disconnects" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.grpc.streams.disconnected").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Events Received" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received"))" />
<MetricCard Label="Worker Event Queue Depth" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.worker_queue.depth"))" />
<MetricCard Label="Stream Queue Depth" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.grpc_stream_queue.depth"))" />
<MetricCard Label="Queue Overflows" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows"))" />
<MetricCard Label="Stream Disconnects" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.grpc.streams.disconnected"))" />
</section>
<section class="dashboard-section">
@@ -47,7 +48,7 @@ else
{
<tr>
<td>@metric.Dimension</td>
<td>@metric.Value</td>
<td>@DashboardDisplay.Count(metric.Value)</td>
</tr>
}
</tbody>
@@ -39,6 +39,7 @@ else
<tr><th scope="row">Opened</th><td>@DashboardDisplay.DateTime(CurrentSession.OpenedAt)</td></tr>
<tr><th scope="row">Last activity</th><td>@DashboardDisplay.DateTime(CurrentSession.LastClientActivityAt)</td></tr>
<tr><th scope="row">Lease expires</th><td>@DashboardDisplay.DateTime(CurrentSession.LeaseExpiresAt)</td></tr>
<tr><th scope="row">Events received</th><td>@DashboardDisplay.Count(CurrentSession.EventsReceived)</td></tr>
<tr><th scope="row">Last fault</th><td>@DashboardDisplay.Text(CurrentSession.LastFault)</td></tr>
</tbody>
</table>
@@ -33,6 +33,7 @@ else
<th scope="col">Client</th>
<th scope="col">Backend</th>
<th scope="col">Worker</th>
<th scope="col">Events</th>
<th scope="col">Opened</th>
<th scope="col">Activity</th>
<th scope="col">Heartbeat</th>
@@ -54,6 +55,7 @@ else
<span class="ms-1"><StatusBadge Text="@session.WorkerState.ToString()" /></span>
}
</td>
<td>@DashboardDisplay.Count(session.EventsReceived)</td>
<td>@DashboardDisplay.DateTime(session.OpenedAt)</td>
<td>@DashboardDisplay.DateTime(session.LastClientActivityAt)</td>
<td>@DashboardDisplay.DateTime(session.LastWorkerHeartbeatAt)</td>
@@ -16,4 +16,5 @@ public sealed record DashboardSessionSummary(
int? WorkerProcessId,
WorkerClientState? WorkerState,
DateTimeOffset? LastWorkerHeartbeatAt,
long EventsReceived,
string? LastFault);
@@ -45,15 +45,15 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
IReadOnlyList<GatewaySession> sessions = _sessionRegistry.Snapshot()
.OrderByDescending(session => session.OpenedAt)
.ToArray();
GatewayMetricsSnapshot metricsSnapshot = _metrics.GetSnapshot();
IReadOnlyList<DashboardSessionSummary> sessionSummaries = sessions
.Take(ResolveLimit(_recentSessionLimit))
.Select(CreateSessionSummary)
.Select(session => CreateSessionSummary(session, metricsSnapshot))
.ToArray();
IReadOnlyList<DashboardWorkerSummary> workerSummaries = sessions
.Where(session => session.WorkerClient is not null)
.Where(session => session.WorkerClient is { State: not WorkerClientState.Closed })
.Select(CreateWorkerSummary)
.ToArray();
GatewayMetricsSnapshot metricsSnapshot = _metrics.GetSnapshot();
return new DashboardSnapshot(
GeneratedAt: generatedAt,
@@ -100,9 +100,12 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
}
}
private static DashboardSessionSummary CreateSessionSummary(GatewaySession session)
private static DashboardSessionSummary CreateSessionSummary(
GatewaySession session,
GatewayMetricsSnapshot metricsSnapshot)
{
IWorkerClient? workerClient = session.WorkerClient;
metricsSnapshot.EventsBySession.TryGetValue(session.SessionId, out long eventsReceived);
return new DashboardSessionSummary(
SessionId: session.SessionId,
@@ -117,6 +120,7 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
WorkerProcessId: workerClient?.ProcessId,
WorkerState: workerClient?.State,
LastWorkerHeartbeatAt: workerClient?.LastHeartbeatAt,
EventsReceived: eventsReceived,
LastFault: DashboardRedactor.Redact(session.FinalFault));
}
@@ -138,7 +142,8 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
[
new("mxgateway.sessions.open", snapshot.OpenSessions),
new("mxgateway.workers.running", snapshot.WorkersRunning),
new("mxgateway.events.queue.depth", snapshot.EventQueueDepth),
new("mxgateway.events.worker_queue.depth", snapshot.WorkerEventQueueDepth),
new("mxgateway.events.grpc_stream_queue.depth", snapshot.GrpcEventStreamQueueDepth),
new("mxgateway.sessions.opened", snapshot.SessionsOpened),
new("mxgateway.sessions.closed", snapshot.SessionsClosed),
new("mxgateway.commands.started", snapshot.CommandsStarted),
@@ -47,7 +47,7 @@ public sealed class EventStreamService(
() =>
{
int depth = Interlocked.Increment(ref streamQueueDepth);
metrics.SetEventQueueDepth(depth);
metrics.SetGrpcEventStreamQueueDepth(depth);
},
streamCts.Token);
@@ -56,7 +56,7 @@ public sealed class EventStreamService(
await foreach (MxEvent mxEvent in eventQueue.Reader.ReadAllAsync(cancellationToken).ConfigureAwait(false))
{
int depth = Math.Max(0, Interlocked.Decrement(ref streamQueueDepth));
metrics.SetEventQueueDepth(depth);
metrics.SetGrpcEventStreamQueueDepth(depth);
yield return mxEvent;
}
+46 -7
View File
@@ -26,11 +26,13 @@ public sealed class GatewayMetrics : IDisposable
private readonly Histogram<double> _eventStreamSendLatencyHistogram;
private readonly Dictionary<string, long> _commandFailuresByMethod = new(StringComparer.OrdinalIgnoreCase);
private readonly Dictionary<string, long> _eventsByFamily = new(StringComparer.OrdinalIgnoreCase);
private readonly Dictionary<string, long> _eventsBySession = new(StringComparer.Ordinal);
private readonly Dictionary<string, long> _retryAttemptsByArea = new(StringComparer.OrdinalIgnoreCase);
private int _openSessions;
private int _workersRunning;
private int _eventQueueDepth;
private int _workerEventQueueDepth;
private int _grpcEventStreamQueueDepth;
private long _sessionsOpened;
private long _sessionsClosed;
private long _commandsStarted;
@@ -68,7 +70,8 @@ public sealed class GatewayMetrics : IDisposable
_meter.CreateObservableGauge("mxgateway.sessions.open", GetOpenSessions);
_meter.CreateObservableGauge("mxgateway.workers.running", GetWorkersRunning);
_meter.CreateObservableGauge("mxgateway.events.queue.depth", GetEventQueueDepth);
_meter.CreateObservableGauge("mxgateway.events.worker_queue.depth", GetWorkerEventQueueDepth);
_meter.CreateObservableGauge("mxgateway.events.grpc_stream_queue.depth", GetGrpcEventStreamQueueDepth);
}
public void SessionOpened()
@@ -174,11 +177,11 @@ public sealed class GatewayMetrics : IDisposable
{
_eventsReceived++;
Increment(_eventsByFamily, family);
Increment(_eventsBySession, sessionId);
}
_eventsReceivedCounter.Add(
1,
new KeyValuePair<string, object?>("session_id", sessionId),
new KeyValuePair<string, object?>("family", family));
}
@@ -190,6 +193,11 @@ public sealed class GatewayMetrics : IDisposable
}
/// <summary>
/// Records the event queue depth by forwarding to
/// <see cref="SetWorkerEventQueueDepth(int)"/>.
/// </summary>
/// <param name="depth">The queue depth to record.</param>
public void SetEventQueueDepth(int depth) => SetWorkerEventQueueDepth(depth);
public void SetWorkerEventQueueDepth(int depth)
{
if (depth < 0)
{
@@ -198,7 +206,28 @@ public sealed class GatewayMetrics : IDisposable
lock (_syncRoot)
{
_eventQueueDepth = depth;
_workerEventQueueDepth = depth;
}
}
/// <summary>
/// Stores the latest gRPC event stream queue depth.
/// </summary>
/// <param name="depth">The queue depth to record; must not be negative.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="depth"/> is negative.
/// </exception>
public void SetGrpcEventStreamQueueDepth(int depth)
{
    // Validate before taking the lock so an invalid value never touches shared state.
    int validatedDepth = depth >= 0
        ? depth
        : throw new ArgumentOutOfRangeException(nameof(depth), depth, "Queue depth cannot be negative.");

    lock (_syncRoot)
    {
        _grpcEventStreamQueueDepth = validatedDepth;
    }
}
/// <summary>
/// Drops the per-session event counter kept for <paramref name="sessionId"/>.
/// </summary>
/// <param name="sessionId">The session whose counter entry should be removed.</param>
public void RemoveSessionEvents(string sessionId)
{
    lock (_syncRoot)
    {
        // Whether an entry existed is irrelevant to callers, so the result is discarded.
        _ = _eventsBySession.Remove(sessionId);
    }
}
@@ -260,7 +289,8 @@ public sealed class GatewayMetrics : IDisposable
return new GatewayMetricsSnapshot(
OpenSessions: _openSessions,
WorkersRunning: _workersRunning,
EventQueueDepth: _eventQueueDepth,
WorkerEventQueueDepth: _workerEventQueueDepth,
GrpcEventStreamQueueDepth: _grpcEventStreamQueueDepth,
SessionsOpened: _sessionsOpened,
SessionsClosed: _sessionsClosed,
CommandsStarted: _commandsStarted,
@@ -276,6 +306,7 @@ public sealed class GatewayMetrics : IDisposable
RetryAttempts: _retryAttempts,
CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase),
EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase),
EventsBySession: new Dictionary<string, long>(_eventsBySession, StringComparer.Ordinal),
RetryAttemptsByArea: new Dictionary<string, long>(_retryAttemptsByArea, StringComparer.OrdinalIgnoreCase));
}
}
@@ -307,11 +338,19 @@ public sealed class GatewayMetrics : IDisposable
}
}
private int GetEventQueueDepth()
private int GetWorkerEventQueueDepth()
{
lock (_syncRoot)
{
return _eventQueueDepth;
return _workerEventQueueDepth;
}
}
/// <summary>
/// Reads the most recently stored gRPC event stream queue depth under the metrics lock.
/// </summary>
/// <returns>The last recorded gRPC event stream queue depth.</returns>
private int GetGrpcEventStreamQueueDepth()
{
    int currentDepth;
    lock (_syncRoot)
    {
        currentDepth = _grpcEventStreamQueueDepth;
    }

    return currentDepth;
}
@@ -3,7 +3,8 @@ namespace MxGateway.Server.Metrics;
public sealed record GatewayMetricsSnapshot(
int OpenSessions,
int WorkersRunning,
int EventQueueDepth,
int WorkerEventQueueDepth,
int GrpcEventStreamQueueDepth,
long SessionsOpened,
long SessionsClosed,
long CommandsStarted,
@@ -19,4 +20,5 @@ public sealed record GatewayMetricsSnapshot(
long RetryAttempts,
IReadOnlyDictionary<string, long> CommandFailuresByMethod,
IReadOnlyDictionary<string, long> EventsByFamily,
IReadOnlyDictionary<string, long> EventsBySession,
IReadOnlyDictionary<string, long> RetryAttemptsByArea);
+60 -15
View File
@@ -23,6 +23,7 @@ public sealed class SessionManager : ISessionManager
private readonly TimeProvider _timeProvider;
private readonly ILogger<SessionManager> _logger;
private readonly GatewayOptions _options;
private readonly SemaphoreSlim _sessionSlots;
public SessionManager(
ISessionRegistry registry,
@@ -39,6 +40,7 @@ public sealed class SessionManager : ISessionManager
_timeProvider = timeProvider ?? TimeProvider.System;
_logger = logger ?? NullLogger<SessionManager>.Instance;
_options = options.Value;
_sessionSlots = new SemaphoreSlim(_options.Sessions.MaxSessions, _options.Sessions.MaxSessions);
}
public async Task<GatewaySession> OpenSessionAsync(
@@ -49,16 +51,17 @@ public sealed class SessionManager : ISessionManager
ArgumentNullException.ThrowIfNull(request);
EnsureSessionCapacity();
GatewaySession session = CreateSession(request, clientIdentity);
if (!_registry.TryAdd(session))
{
throw new SessionManagerException(
SessionManagerErrorCode.OpenFailed,
$"Session id collision while opening session {session.SessionId}.");
}
GatewaySession? session = null;
try
{
session = CreateSession(request, clientIdentity);
if (!_registry.TryAdd(session))
{
throw new SessionManagerException(
SessionManagerErrorCode.OpenFailed,
$"Session id collision while opening session {session.SessionId}.");
}
session.TransitionTo(SessionState.StartingWorker);
IWorkerClient workerClient = await _workerClientFactory
.CreateAsync(session, cancellationToken)
@@ -72,18 +75,23 @@ public sealed class SessionManager : ISessionManager
}
catch (Exception exception)
{
session.MarkFaulted(exception.Message);
_registry.TryRemove(session.SessionId, out _);
await session.DisposeAsync().ConfigureAwait(false);
session?.MarkFaulted(exception.Message);
if (session is not null)
{
_registry.TryRemove(session.SessionId, out _);
await session.DisposeAsync().ConfigureAwait(false);
}
ReleaseSessionSlot();
_metrics.Fault(SessionManagerErrorCode.OpenFailed.ToString());
_logger.LogWarning(
exception,
"Failed to open gateway session {SessionId}.",
session.SessionId);
session?.SessionId ?? "<not-created>");
throw new SessionManagerException(
SessionManagerErrorCode.OpenFailed,
$"Failed to open session {session.SessionId}.",
session is null ? "Failed to create session." : $"Failed to open session {session.SessionId}.",
exception);
}
}
@@ -177,6 +185,7 @@ public sealed class SessionManager : ISessionManager
"Graceful shutdown failed for session {SessionId}; killing worker.",
session.SessionId);
session.KillWorker(GatewayShutdownReason);
await RemoveSessionAsync(session).ConfigureAwait(false);
}
}
}
@@ -195,6 +204,7 @@ public sealed class SessionManager : ISessionManager
_metrics.SessionClosed();
}
await RemoveSessionAsync(session).ConfigureAwait(false);
return result;
}
catch (Exception exception)
@@ -222,7 +232,7 @@ public sealed class SessionManager : ISessionManager
private void EnsureSessionCapacity()
{
if (_registry.ActiveCount >= _options.Sessions.MaxSessions)
if (!_sessionSlots.Wait(0))
{
throw new SessionManagerException(
SessionManagerErrorCode.SessionLimitExceeded,
@@ -230,6 +240,29 @@ public sealed class SessionManager : ISessionManager
}
}
/// <summary>
/// Removes a session from the registry and, only if it was still registered,
/// clears its per-session event counter, frees its session slot, and disposes it.
/// </summary>
/// <param name="session">The session to remove.</param>
/// <returns>A task that completes once the removed session has been disposed.</returns>
private async Task RemoveSessionAsync(GatewaySession session)
{
    string sessionId = session.SessionId;

    // Cleanup is tied to a successful registry removal, so a session that is
    // already gone does not release a slot a second time.
    if (_registry.TryRemove(sessionId, out GatewaySession? registeredSession))
    {
        _metrics.RemoveSessionEvents(sessionId);
        ReleaseSessionSlot();
        await registeredSession.DisposeAsync().ConfigureAwait(false);
    }
}
/// <summary>
/// Returns one slot to the session-capacity semaphore.
/// </summary>
/// <remarks>
/// Releasing while every slot is already free raises
/// <see cref="SemaphoreFullException"/>. That is still swallowed so cleanup paths
/// never fail because of slot accounting, but it is now logged: an over-release
/// means some path released a slot it did not hold, and silence would hide that.
/// </remarks>
private void ReleaseSessionSlot()
{
    try
    {
        _sessionSlots.Release();
    }
    catch (SemaphoreFullException exception)
    {
        _logger.LogWarning(
            exception,
            "Session slot released while all {MaxSessions} slot(s) were already free; ignoring the extra release.",
            _options.Sessions.MaxSessions);
    }
}
private GatewaySession CreateSession(
SessionOpenRequest request,
string? clientIdentity)
@@ -244,6 +277,7 @@ public sealed class SessionManager : ISessionManager
string pipeName = $"mxaccess-gateway-{Environment.ProcessId}-{sessionId}";
string nonce = CreateNonce();
DateTimeOffset openedAt = _timeProvider.GetUtcNow();
string clientCorrelationId = CreateClientCorrelationId(request.ClientSessionName, sessionId);
return new GatewaySession(
sessionId,
@@ -252,13 +286,24 @@ public sealed class SessionManager : ISessionManager
nonce,
clientIdentity,
request.ClientSessionName,
request.ClientCorrelationId,
clientCorrelationId,
commandTimeout,
startupTimeout,
shutdownTimeout,
openedAt);
}
/// <summary>
/// Builds the correlation id for a session as <c>{clientName}-{sessionId}</c>.
/// </summary>
/// <param name="clientSessionName">
/// The client-supplied session name; when null, empty, or whitespace the
/// literal prefix <c>client</c> is used instead.
/// </param>
/// <param name="sessionId">The gateway session id appended after the prefix.</param>
/// <returns>The combined correlation id.</returns>
private static string CreateClientCorrelationId(
    string? clientSessionName,
    string sessionId)
{
    if (string.IsNullOrWhiteSpace(clientSessionName))
    {
        return $"client-{sessionId}";
    }

    return $"{clientSessionName}-{sessionId}";
}
private TimeSpan ResolveCommandTimeout(Duration? requestedTimeout)
{
if (requestedTimeout is null)
@@ -7,6 +7,7 @@ public static class SessionServiceCollectionExtensions
services.AddSingleton<ISessionRegistry, SessionRegistry>();
services.AddSingleton<ISessionWorkerClientFactory, SessionWorkerClientFactory>();
services.AddSingleton<ISessionManager, SessionManager>();
services.AddHostedService<SessionShutdownHostedService>();
return services;
}
@@ -0,0 +1,26 @@
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
namespace MxGateway.Server.Sessions;
/// <summary>
/// Hosted service that asks the session manager to shut down when the host stops.
/// </summary>
public sealed class SessionShutdownHostedService(
    ISessionManager sessionManager,
    ILogger<SessionShutdownHostedService> logger) : IHostedService
{
    /// <summary>
    /// Performs no startup work and completes immediately.
    /// </summary>
    /// <param name="cancellationToken">Unused.</param>
    /// <returns>A completed task.</returns>
    public Task StartAsync(CancellationToken cancellationToken) => Task.CompletedTask;

    /// <summary>
    /// Requests session shutdown and awaits its completion.
    /// </summary>
    /// <param name="cancellationToken">Token passed through to the session manager.</param>
    /// <returns>A task that completes when shutdown finishes or is canceled.</returns>
    public async Task StopAsync(CancellationToken cancellationToken)
    {
        try
        {
            Task shutdown = sessionManager.ShutdownAsync(cancellationToken);
            await shutdown.ConfigureAwait(false);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Cancellation through the supplied token is logged rather than rethrown;
            // any other exception still propagates to the caller.
            logger.LogWarning("Gateway session shutdown was canceled by host shutdown timeout.");
        }
    }
}
@@ -74,6 +74,7 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
HeartbeatGrace = TimeSpan.FromSeconds(_options.Worker.HeartbeatGraceSeconds),
HeartbeatCheckInterval = TimeSpan.FromSeconds(_options.Worker.HeartbeatIntervalSeconds),
EventChannelCapacity = _options.Events.QueueCapacity,
MaxPendingCommands = _options.Sessions.MaxPendingCommandsPerSession,
};
workerClient = new WorkerClient(
+55 -6
View File
@@ -24,6 +24,7 @@ public sealed class WorkerClient : IWorkerClient
private readonly Channel<WorkerEnvelope> _outboundEnvelopes;
private readonly Channel<WorkerEvent> _events;
private readonly ConcurrentDictionary<string, PendingCommand> _pendingCommands = new(StringComparer.Ordinal);
private readonly SemaphoreSlim _pendingCommandSlots;
private readonly CancellationTokenSource _stopCts = new();
private long _nextSequence;
private WorkerClientState _state;
@@ -33,6 +34,7 @@ public sealed class WorkerClient : IWorkerClient
private Task? _readLoopTask;
private Task? _writeLoopTask;
private Task? _heartbeatLoopTask;
private bool _workerStartRecorded;
private bool _disposed;
public WorkerClient(
@@ -49,11 +51,13 @@ public sealed class WorkerClient : IWorkerClient
_logger = logger ?? NullLogger<WorkerClient>.Instance;
_reader = new WorkerFrameReader(connection.Stream, connection.FrameOptions);
_writer = new WorkerFrameWriter(connection.Stream, connection.FrameOptions);
_outboundEnvelopes = Channel.CreateUnbounded<WorkerEnvelope>(
new UnboundedChannelOptions
_pendingCommandSlots = new SemaphoreSlim(_options.MaxPendingCommands, _options.MaxPendingCommands);
_outboundEnvelopes = Channel.CreateBounded<WorkerEnvelope>(
new BoundedChannelOptions(_options.MaxPendingCommands + 4)
{
SingleReader = true,
SingleWriter = false,
FullMode = BoundedChannelFullMode.Wait,
AllowSynchronousContinuations = false,
});
_events = Channel.CreateBounded<WorkerEvent>(
@@ -140,6 +144,14 @@ public sealed class WorkerClient : IWorkerClient
string correlationId = Guid.NewGuid().ToString("N");
string method = GetCommandMethod(command);
if (!_pendingCommandSlots.Wait(0))
{
_metrics?.QueueOverflow("worker-pending-commands");
throw new WorkerClientException(
WorkerClientErrorCode.PendingCommandLimitExceeded,
$"Worker session {SessionId} already has {_options.MaxPendingCommands} pending command(s).");
}
PendingCommand pendingCommand = new(
correlationId,
method,
@@ -147,6 +159,7 @@ public sealed class WorkerClient : IWorkerClient
if (!_pendingCommands.TryAdd(correlationId, pendingCommand))
{
ReleasePendingCommandSlot();
throw new InvalidOperationException("Generated a duplicate command correlation id.");
}
@@ -188,7 +201,11 @@ public sealed class WorkerClient : IWorkerClient
}
catch
{
_pendingCommands.TryRemove(correlationId, out _);
if (_pendingCommands.TryRemove(correlationId, out _))
{
ReleasePendingCommandSlot();
}
throw;
}
}
@@ -199,7 +216,7 @@ public sealed class WorkerClient : IWorkerClient
await foreach (WorkerEvent workerEvent in _events.Reader.ReadAllAsync(cancellationToken).ConfigureAwait(false))
{
int queueDepth = Math.Max(0, Interlocked.Decrement(ref _eventQueueDepth));
_metrics?.SetEventQueueDepth(queueDepth);
_metrics?.SetWorkerEventQueueDepth(queueDepth);
yield return workerEvent;
}
}
@@ -272,6 +289,7 @@ public sealed class WorkerClient : IWorkerClient
await WaitForBackgroundTasksAsync(CancellationToken.None).ConfigureAwait(false);
await _connection.Stream.DisposeAsync().ConfigureAwait(false);
_connection.ProcessHandle?.Dispose();
_pendingCommandSlots.Dispose();
_stopCts.Dispose();
}
@@ -409,7 +427,7 @@ public sealed class WorkerClient : IWorkerClient
}
int queueDepth = Interlocked.Increment(ref _eventQueueDepth);
_metrics?.SetEventQueueDepth(queueDepth);
_metrics?.SetWorkerEventQueueDepth(queueDepth);
}
private void CompleteCommand(WorkerEnvelope envelope)
@@ -429,6 +447,7 @@ public sealed class WorkerClient : IWorkerClient
return;
}
ReleasePendingCommandSlot();
TimeSpan duration = _timeProvider.GetElapsedTime(pendingCommand.StartTimestamp);
_metrics?.CommandSucceeded(pendingCommand.Method, duration);
pendingCommand.SetResult(envelope.WorkerCommandReply);
@@ -445,6 +464,7 @@ public sealed class WorkerClient : IWorkerClient
return;
}
ReleasePendingCommandSlot();
TimeSpan duration = _timeProvider.GetElapsedTime(pendingCommand.StartTimestamp);
_metrics?.CommandFailed(pendingCommand.Method, errorCode.ToString(), duration);
pendingCommand.SetException(new WorkerClientException(errorCode, message));
@@ -498,6 +518,7 @@ public sealed class WorkerClient : IWorkerClient
: ready.WorkerProcessId;
_lastHeartbeatAt = _timeProvider.GetUtcNow();
_state = WorkerClientState.Ready;
_workerStartRecorded = true;
}
DateTimeOffset readyAt = _timeProvider.GetUtcNow();
@@ -549,7 +570,7 @@ public sealed class WorkerClient : IWorkerClient
new WorkerClientException(
WorkerClientErrorCode.GatewayShutdown,
$"Worker client closed because {reason}."));
_metrics?.WorkerStopped(reason);
RecordWorkerStoppedOnce(reason);
}
private void SetFaulted(
@@ -575,16 +596,33 @@ public sealed class WorkerClient : IWorkerClient
_outboundEnvelopes.Writer.TryComplete(fault);
_events.Writer.TryComplete(fault);
CompletePendingCommands(fault);
RecordWorkerStoppedOnce(errorCode.ToString());
_metrics?.Fault(errorCode.ToString());
_logger.LogWarning(exception, "Worker client faulted for session {SessionId}: {Message}", SessionId, message);
}
/// <summary>
/// Reports a worker stop to the metrics sink, but only if a worker start is
/// currently recorded; the recorded flag is cleared in the same locked step so
/// a later call does not report the stop again.
/// </summary>
/// <param name="reason">The stop reason passed to the metrics sink.</param>
private void RecordWorkerStoppedOnce(string reason)
{
    bool startWasRecorded;
    lock (_syncRoot)
    {
        // Read and clear together under the lock.
        startWasRecorded = _workerStartRecorded;
        _workerStartRecorded = false;
    }

    if (!startWasRecorded)
    {
        return;
    }

    // The metrics call is made after the lock has been released.
    _metrics?.WorkerStopped(reason);
}
private void CompletePendingCommands(Exception exception)
{
foreach (KeyValuePair<string, PendingCommand> item in _pendingCommands.ToArray())
{
if (_pendingCommands.TryRemove(item.Key, out PendingCommand? pendingCommand))
{
ReleasePendingCommandSlot();
TimeSpan duration = _timeProvider.GetElapsedTime(pendingCommand.StartTimestamp);
_metrics?.CommandFailed(pendingCommand.Method, exception.GetType().Name, duration);
pendingCommand.SetException(exception);
@@ -592,6 +630,17 @@ public sealed class WorkerClient : IWorkerClient
}
}
/// <summary>
/// Returns one slot to the pending-command semaphore.
/// </summary>
/// <remarks>
/// Releasing while every slot is already free raises
/// <see cref="SemaphoreFullException"/>. That is still swallowed so command
/// completion never fails because of slot accounting, but it is now logged: an
/// over-release means a slot was released without a matching acquisition, and
/// silence would hide that.
/// </remarks>
private void ReleasePendingCommandSlot()
{
    try
    {
        _pendingCommandSlots.Release();
    }
    catch (SemaphoreFullException exception)
    {
        _logger.LogWarning(
            exception,
            "Pending command slot released for session {SessionId} while all {MaxPendingCommands} slot(s) were already free; ignoring the extra release.",
            SessionId,
            _options.MaxPendingCommands);
    }
}
private void TransitionFromCreatedToHandshaking()
{
lock (_syncRoot)
@@ -11,4 +11,5 @@ public enum WorkerClientErrorCode
ShutdownTimeout,
GatewayShutdown,
WriteFailed,
PendingCommandLimitExceeded,
}
@@ -12,6 +12,7 @@ public sealed class WorkerClientOptions
HeartbeatCheckInterval = DefaultHeartbeatCheckInterval;
EventChannelCapacity = 1_024;
EventChannelFullModeTimeout = DefaultEventChannelFullModeTimeout;
MaxPendingCommands = 128;
}
public TimeSpan HeartbeatGrace { get; init; }
@@ -21,4 +22,6 @@ public sealed class WorkerClientOptions
public int EventChannelCapacity { get; init; }
public TimeSpan EventChannelFullModeTimeout { get; init; }
public int MaxPendingCommands { get; init; }
}
@@ -96,8 +96,6 @@ public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
startupTimeout.Token)
.ConfigureAwait(false);
_metrics.WorkerStarted(_timeProvider.GetUtcNow() - startedAt);
return new WorkerProcessHandle(process, commandLine, startedAt);
}
catch (OperationCanceledException exception) when (!cancellationToken.IsCancellationRequested)
@@ -42,8 +42,15 @@ public sealed class DashboardSnapshotServiceTests
DateTimeOffset.Parse("2026-04-26T10:01:00Z"));
faultedSession.AttachWorkerClient(new FakeWorkerClient("session-faulted", 1202, WorkerClientState.Faulted));
faultedSession.MarkFaulted("worker pipe disconnected");
GatewaySession closedSession = CreateSession(
"session-closed",
"client-three",
DateTimeOffset.Parse("2026-04-26T09:59:00Z"));
closedSession.AttachWorkerClient(new FakeWorkerClient("session-closed", 1203, WorkerClientState.Closed));
closedSession.TransitionTo(SessionState.Closed);
registry.TryAdd(activeSession);
registry.TryAdd(faultedSession);
registry.TryAdd(closedSession);
using GatewayMetrics metrics = new();
metrics.SessionOpened();
metrics.SessionOpened();
@@ -55,10 +62,15 @@ public sealed class DashboardSnapshotServiceTests
DashboardSnapshot snapshot = service.GetSnapshot();
Assert.Equal(2, snapshot.Sessions.Count);
Assert.Equal(3, snapshot.Sessions.Count);
Assert.Equal("session-faulted", snapshot.Sessions[0].SessionId);
Assert.Equal(SessionState.Faulted, snapshot.Sessions[0].State);
DashboardSessionSummary activeSummary = Assert.Single(
snapshot.Sessions,
session => session.SessionId == "session-active");
Assert.Equal(1, activeSummary.EventsReceived);
Assert.Equal(2, snapshot.Workers.Count);
Assert.DoesNotContain(snapshot.Workers, worker => worker.SessionId == "session-closed");
Assert.Contains(snapshot.Metrics, metric => metric.Name == "mxgateway.commands.started" && metric.Value == 1);
Assert.Contains(
snapshot.Metrics,
@@ -32,6 +32,35 @@ public sealed class SessionManagerTests
Assert.Equal(1, metrics.GetSnapshot().SessionsOpened);
}
/// <summary>
/// The session's correlation id is built from the client session name and the
/// session id, even when the request carries its own correlation id.
/// </summary>
[Fact]
public async Task OpenSessionAsync_GeneratesClientCorrelationIdFromClientNameAndSessionId()
{
    // Arrange
    SessionOpenRequest openRequest = CreateOpenRequest() with
    {
        ClientSessionName = "rust-load-client",
        ClientCorrelationId = "caller-provided-correlation",
    };
    FakeSessionWorkerClientFactory workerClientFactory = new(new FakeWorkerClient());
    SessionManager manager = CreateManager(workerClientFactory);

    // Act
    GatewaySession opened = await manager.OpenSessionAsync(openRequest, "client-1", CancellationToken.None);

    // Assert
    string expectedCorrelationId = $"rust-load-client-{opened.SessionId}";
    Assert.Equal(expectedCorrelationId, opened.ClientCorrelationId);
}
/// <summary>
/// With an empty client session name, the correlation id uses the
/// <c>client</c> prefix followed by the session id.
/// </summary>
[Fact]
public async Task OpenSessionAsync_WhenClientSessionNameMissing_UsesClientCorrelationPrefix()
{
    // Arrange
    SessionOpenRequest openRequest = CreateOpenRequest() with
    {
        ClientSessionName = "",
    };
    FakeSessionWorkerClientFactory workerClientFactory = new(new FakeWorkerClient());
    SessionManager manager = CreateManager(workerClientFactory);

    // Act
    GatewaySession opened = await manager.OpenSessionAsync(openRequest, "client-1", CancellationToken.None);

    // Assert
    string expectedCorrelationId = $"client-{opened.SessionId}";
    Assert.Equal(expectedCorrelationId, opened.ClientCorrelationId);
}
[Fact]
public async Task InvokeAsync_WhenSessionReady_ForwardsCommandToWorker()
{
@@ -111,7 +140,7 @@ public sealed class SessionManagerTests
}
[Fact]
public async Task CloseSessionAsync_WhenCalledTwice_IsIdempotent()
public async Task CloseSessionAsync_RemovesClosedSession()
{
FakeWorkerClient workerClient = new();
using GatewayMetrics metrics = new();
@@ -119,12 +148,12 @@ public sealed class SessionManagerTests
GatewaySession session = await manager.OpenSessionAsync(CreateOpenRequest(), "client-1", CancellationToken.None);
SessionCloseResult firstClose = await manager.CloseSessionAsync(session.SessionId, CancellationToken.None);
SessionCloseResult secondClose = await manager.CloseSessionAsync(session.SessionId, CancellationToken.None);
SessionManagerException secondClose = await Assert.ThrowsAsync<SessionManagerException>(
async () => await manager.CloseSessionAsync(session.SessionId, CancellationToken.None));
Assert.False(firstClose.AlreadyClosed);
Assert.True(secondClose.AlreadyClosed);
Assert.Equal(SessionState.Closed, firstClose.FinalState);
Assert.Equal(SessionState.Closed, secondClose.FinalState);
Assert.Equal(SessionManagerErrorCode.SessionNotFound, secondClose.ErrorCode);
Assert.Equal(1, workerClient.ShutdownCount);
Assert.Equal(1, metrics.GetSnapshot().SessionsClosed);
Assert.Equal(0, metrics.GetSnapshot().OpenSessions);
@@ -2,6 +2,7 @@ using System.IO.Pipes;
using Google.Protobuf.WellKnownTypes;
using MxGateway.Contracts;
using MxGateway.Contracts.Proto;
using MxGateway.Server.Metrics;
using MxGateway.Server.Workers;
namespace MxGateway.Tests.Gateway.Workers;
@@ -152,6 +153,27 @@ public sealed class WorkerClientTests
Assert.Equal(WorkerClientState.Faulted, client.State);
}
/// <summary>
/// After the worker side of the pipe is disposed and the client faults, the
/// running-worker metric returns to zero and exactly one worker exit is recorded.
/// </summary>
[Fact]
public async Task ReadLoop_WhenPipeDisconnects_StopsRunningWorkerMetric()
{
    // Arrange: a client that has completed its handshake counts as one running worker.
    await using PipePair pipePair = await PipePair.CreateAsync();
    using GatewayMetrics metrics = new();
    await using WorkerClient client = CreateClient(pipePair, metrics: metrics);
    await CompleteHandshakeAsync(client, pipePair);
    int runningAfterHandshake = metrics.GetSnapshot().WorkersRunning;
    Assert.Equal(1, runningAfterHandshake);

    // Act: drop the worker side and wait for the client to report the fault.
    await pipePair.DisposeWorkerSideAsync();
    await WaitUntilAsync(
        () => client.State == WorkerClientState.Faulted,
        TestTimeout);

    // Assert
    GatewayMetricsSnapshot afterDisconnect = metrics.GetSnapshot();
    Assert.Equal(0, afterDisconnect.WorkersRunning);
    Assert.Equal(1, afterDisconnect.WorkerExits);
}
[Fact]
public async Task ReadLoop_WhenHeartbeatArrives_UpdatesLastHeartbeatAndWorkerProcess()
{
@@ -193,7 +215,8 @@ public sealed class WorkerClientTests
private static WorkerClient CreateClient(
PipePair pipePair,
WorkerClientOptions? options = null)
WorkerClientOptions? options = null,
GatewayMetrics? metrics = null)
{
WorkerFrameProtocolOptions frameOptions = new(SessionId);
WorkerClientConnection connection = new(
@@ -202,7 +225,7 @@ public sealed class WorkerClientTests
pipePair.GatewayStream,
frameOptions);
return new WorkerClient(connection, options);
return new WorkerClient(connection, options, metrics);
}
private static async Task CompleteHandshakeAsync(
@@ -43,7 +43,7 @@ public sealed class WorkerProcessLauncherTests
Assert.DoesNotContain(Nonce, handle.CommandLine.ToString(), StringComparison.Ordinal);
Assert.DoesNotContain(Nonce, string.Join(" ", handle.CommandLine.Arguments), StringComparison.Ordinal);
Assert.False(pipeReservation.DisposeCalled);
Assert.Equal(1, metrics.GetSnapshot().WorkersRunning);
Assert.Equal(0, metrics.GetSnapshot().WorkersRunning);
}
[Fact]
@@ -17,7 +17,8 @@ public sealed class GatewayMetricsTests
metrics.CommandFailed("WriteSecured", "AuthorizationFailed", TimeSpan.FromMilliseconds(12));
metrics.EventReceived("session-1", "OnDataChange");
metrics.EventReceived("session-1", "OnDataChange");
metrics.SetEventQueueDepth(7);
metrics.SetWorkerEventQueueDepth(7);
metrics.SetGrpcEventStreamQueueDepth(3);
metrics.QueueOverflow("session-events");
metrics.Fault("CommandTimeout");
metrics.WorkerKilled("CommandTimeout");
@@ -30,7 +31,8 @@ public sealed class GatewayMetricsTests
Assert.Equal(0, snapshot.OpenSessions);
Assert.Equal(0, snapshot.WorkersRunning);
Assert.Equal(7, snapshot.EventQueueDepth);
Assert.Equal(7, snapshot.WorkerEventQueueDepth);
Assert.Equal(3, snapshot.GrpcEventStreamQueueDepth);
Assert.Equal(1, snapshot.SessionsOpened);
Assert.Equal(1, snapshot.SessionsClosed);
Assert.Equal(2, snapshot.CommandsStarted);
@@ -45,6 +47,7 @@ public sealed class GatewayMetricsTests
Assert.Equal(1, snapshot.StreamDisconnects);
Assert.Equal(1, snapshot.CommandFailuresByMethod["WriteSecured"]);
Assert.Equal(2, snapshot.EventsByFamily["OnDataChange"]);
Assert.Equal(2, snapshot.EventsBySession["session-1"]);
}
[Fact]
@@ -53,7 +56,7 @@ public sealed class GatewayMetricsTests
using GatewayMetrics metrics = new();
ArgumentOutOfRangeException exception = Assert.Throws<ArgumentOutOfRangeException>(
() => metrics.SetEventQueueDepth(-1));
() => metrics.SetWorkerEventQueueDepth(-1));
Assert.Equal("depth", exception.ParamName);
}
@@ -1,4 +1,5 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Pipes;
using System.Threading;
@@ -228,6 +229,21 @@ public sealed class WorkerPipeClientTests
currentCommandCorrelationId: string.Empty);
}
// Always reports an empty batch; maxEvents is ignored by this implementation.
public IReadOnlyList<WorkerEvent> DrainEvents(uint maxEvents) => Array.Empty<WorkerEvent>();
// Never reports a fault.
public WorkerFault? DrainFault() => null;
// Never cancels anything; every correlation id is reported as not canceled.
public bool CancelCommand(string correlationId) => false;
// Intentionally empty: this implementation takes no action when shutdown is requested.
public void RequestShutdown()
{
}
@@ -238,6 +238,37 @@ public sealed class WorkerPipeSessionTests
await SendShutdownAndWaitAsync(pipePair, runTask, cancellation.Token);
}
// Verifies that an event queued on the runtime session after the handshake is written to the
// gateway side of the pipe as a WorkerEvent envelope carrying the same family and sequence.
[Fact]
public async Task RunAsync_WhenRuntimeHasEvents_WritesWorkerEventEnvelope()
{
// The 5-second token bounds the whole test, including pipe creation and reads.
using CancellationTokenSource cancellation = new(TimeSpan.FromSeconds(5));
using PipePair pipePair = await PipePair.CreateAsync(cancellation.Token);
FakeRuntimeSession runtime = new();
WorkerPipeSession session = CreatePipeSession(
pipePair.WorkerStream,
runtime,
new WorkerPipeSessionOptions
{
HeartbeatInterval = TimeSpan.FromMilliseconds(100),
HeartbeatGrace = TimeSpan.FromSeconds(5),
});
// Start the session without awaiting it; it keeps running until shutdown is sent below.
Task runTask = session.RunAsync(cancellation.Token);
await CompleteGatewayHandshakeAsync(pipePair, cancellation.Token);
runtime.EnqueueEvent(CreateWorkerEvent(sequence: 7));
// Read from the gateway side until an envelope with a WorkerEvent body is returned.
WorkerEnvelope workerEvent = await ReadUntilAsync(
pipePair.GatewayReader,
WorkerEnvelope.BodyOneofCase.WorkerEvent,
cancellation.Token);
Assert.Equal(MxEventFamily.OnDataChange, workerEvent.WorkerEvent.Event.Family);
Assert.Equal(7UL, workerEvent.WorkerEvent.Event.WorkerSequence);
await SendShutdownAndWaitAsync(pipePair, runTask, cancellation.Token);
}
[Fact]
public async Task RunAsync_WhenStaActivityIsStale_WritesWatchdogFault()
{
@@ -364,6 +395,20 @@ public sealed class WorkerPipeSessionTests
};
}
// Builds a WorkerEvent wrapping an OnDataChange MxEvent for the test session,
// stamped with the supplied worker sequence number.
private static WorkerEvent CreateWorkerEvent(ulong sequence)
{
    MxEvent payload = new()
    {
        SessionId = SessionId,
        Family = MxEventFamily.OnDataChange,
        WorkerSequence = sequence,
        OnDataChange = new OnDataChangeEvent(),
    };

    return new WorkerEvent { Event = payload };
}
private static async Task CompleteGatewayHandshakeAsync(
PipePair pipePair,
CancellationToken cancellationToken)
@@ -478,6 +523,7 @@ public sealed class WorkerPipeSessionTests
{
private readonly ManualResetEventSlim releaseDispatch = new(false);
private readonly object gate = new();
private readonly Queue<WorkerEvent> events = new();
private WorkerRuntimeHeartbeatSnapshot snapshot = new(
DateTimeOffset.UtcNow,
pendingCommandCount: 0,
@@ -550,6 +596,33 @@ public sealed class WorkerPipeSessionTests
}
}
// Removes and returns queued events in FIFO order while holding the gate.
// A maxEvents of 0 drains everything; otherwise at most maxEvents items are taken.
public IReadOnlyList<WorkerEvent> DrainEvents(uint maxEvents)
{
    lock (gate)
    {
        int available = events.Count;
        int take = available;
        if (maxEvents != 0)
        {
            // Clamp the unsigned limit into int range before comparing with the queue size.
            take = Math.Min(available, checked((int)Math.Min(maxEvents, int.MaxValue)));
        }

        List<WorkerEvent> batch = new(take);
        while (batch.Count < take)
        {
            batch.Add(events.Dequeue());
        }

        return batch;
    }
}
// Never reports a fault.
public WorkerFault? DrainFault() => null;
// Never cancels anything; every correlation id is reported as not canceled.
public bool CancelCommand(string correlationId) => false;
public void RequestShutdown()
{
releaseDispatch.Set();
@@ -576,6 +649,14 @@ public sealed class WorkerPipeSessionTests
}
}
// Appends an event to the pending queue while holding the gate, so that a later
// DrainEvents call (which dequeues under the same lock) can return it.
public void EnqueueEvent(WorkerEvent workerEvent)
{
lock (gate)
{
events.Enqueue(workerEvent);
}
}
public void Dispose()
{
releaseDispatch.Set();
+16 -4
View File
@@ -148,10 +148,22 @@ public sealed class WorkerPipeClient : IWorkerPipeClient
})
.Build();
return await pipeline.ExecuteAsync(
async token => await ConnectSingleAttemptAsync(pipeName, token).ConfigureAwait(false),
cancellationToken)
.ConfigureAwait(false);
using CancellationTokenSource connectDeadline =
CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
connectDeadline.CancelAfter(_connectTimeoutMilliseconds);
try
{
return await pipeline.ExecuteAsync(
async token => await ConnectSingleAttemptAsync(pipeName, token).ConfigureAwait(false),
connectDeadline.Token)
.ConfigureAwait(false);
}
catch (OperationCanceledException) when (!cancellationToken.IsCancellationRequested)
{
throw new TimeoutException(
$"Worker pipe {pipeName} did not connect within {_connectTimeoutMilliseconds}ms.");
}
}
private async Task<NamedPipeClientStream> ConnectSingleAttemptAsync(
+68 -1
View File
@@ -14,6 +14,9 @@ namespace MxGateway.Worker.Ipc;
public sealed class WorkerPipeSession
{
private static readonly TimeSpan EventDrainInterval = TimeSpan.FromMilliseconds(25);
private const uint EventDrainBatchSize = 128;
private readonly WorkerFrameProtocolOptions _options;
private readonly Func<int> _processIdProvider;
private readonly Func<IWorkerRuntimeSession> _runtimeSessionFactory;
@@ -206,17 +209,22 @@ public sealed class WorkerPipeSession
using CancellationTokenSource heartbeatCancellation = CancellationTokenSource
.CreateLinkedTokenSource(cancellationToken);
Task heartbeatTask = RunHeartbeatLoopAsync(heartbeatCancellation.Token);
Task eventDrainTask = RunEventDrainLoopAsync(heartbeatCancellation.Token);
try
{
while (!cancellationToken.IsCancellationRequested)
{
Task<WorkerEnvelope> readTask = _reader.ReadAsync(cancellationToken);
Task completedTask = await Task.WhenAny(readTask, heartbeatTask).ConfigureAwait(false);
Task completedTask = await Task.WhenAny(readTask, heartbeatTask, eventDrainTask).ConfigureAwait(false);
if (completedTask == heartbeatTask)
{
await heartbeatTask.ConfigureAwait(false);
}
else if (completedTask == eventDrainTask)
{
await eventDrainTask.ConfigureAwait(false);
}
WorkerEnvelope envelope = await readTask.ConfigureAwait(false);
bool keepReading = await DispatchGatewayEnvelopeAsync(envelope, cancellationToken).ConfigureAwait(false);
@@ -236,6 +244,52 @@ public sealed class WorkerPipeSession
catch (OperationCanceledException)
{
}
try
{
await eventDrainTask.ConfigureAwait(false);
}
catch (OperationCanceledException)
{
}
}
}
/// <summary>
/// Repeatedly drains the runtime session and forwards what it finds to the gateway writer
/// until <paramref name="cancellationToken"/> is canceled.
/// </summary>
/// <remarks>
/// Each pass checks for a fault first: a drained fault marks the session as faulted, is
/// written via <c>TryWriteFaultAsync</c>, and then ends the loop by throwing. Otherwise up to
/// <c>EventDrainBatchSize</c> events are written in order. When there is no runtime session
/// yet, or nothing was drained, the loop waits <c>EventDrainInterval</c> before retrying.
/// </remarks>
/// <param name="cancellationToken">Stops the loop and cancels pending delays and writes.</param>
/// <exception cref="InvalidOperationException">The runtime session reported a fault.</exception>
private async Task RunEventDrainLoopAsync(CancellationToken cancellationToken)
{
    while (!cancellationToken.IsCancellationRequested)
    {
        // Read the field once so the whole pass works against the same session instance.
        IWorkerRuntimeSession? runtime = _runtimeSession;
        if (runtime is not null)
        {
            WorkerFault? pendingFault = runtime.DrainFault();
            if (pendingFault is not null)
            {
                _state = WorkerState.Faulted;
                await TryWriteFaultAsync(pendingFault, cancellationToken).ConfigureAwait(false);

                // Prefer the fault's own diagnostic text; fall back to a category-based message.
                string failureMessage = string.IsNullOrWhiteSpace(pendingFault.DiagnosticMessage)
                    ? $"MXAccess event queue faulted with category {pendingFault.Category}."
                    : pendingFault.DiagnosticMessage;
                throw new InvalidOperationException(failureMessage);
            }

            IReadOnlyList<WorkerEvent> batch = runtime.DrainEvents(EventDrainBatchSize);
            if (batch.Count > 0)
            {
                foreach (WorkerEvent pendingEvent in batch)
                {
                    await _writer
                        .WriteAsync(CreateEnvelope(pendingEvent), cancellationToken)
                        .ConfigureAwait(false);
                }

                // Something was drained, so poll again immediately instead of sleeping.
                continue;
            }
        }

        // No session yet, or nothing to send: back off before the next poll.
        await Task.Delay(EventDrainInterval, cancellationToken).ConfigureAwait(false);
    }
}
@@ -252,6 +306,7 @@ public sealed class WorkerPipeSession
await ShutdownAsync(envelope.WorkerShutdown, cancellationToken).ConfigureAwait(false);
return false;
case WorkerEnvelope.BodyOneofCase.WorkerCancel:
_runtimeSession?.CancelCommand(envelope.CorrelationId);
return true;
default:
throw new WorkerFrameProtocolException(
@@ -461,6 +516,11 @@ public sealed class WorkerPipeSession
return CreateBaseEnvelope(reply);
}
/// <summary>Wraps a worker event in an envelope by delegating to <c>CreateBaseEnvelope</c>.</summary>
private WorkerEnvelope CreateEnvelope(WorkerEvent workerEvent) => CreateBaseEnvelope(workerEvent);
private WorkerEnvelope CreateEnvelope(WorkerShutdownAck shutdownAck)
{
return CreateBaseEnvelope(shutdownAck);
@@ -500,6 +560,13 @@ public sealed class WorkerPipeSession
return envelope;
}
/// <summary>
/// Creates a base envelope and sets <paramref name="body"/> as its <c>WorkerEvent</c> body.
/// </summary>
private WorkerEnvelope CreateBaseEnvelope(WorkerEvent body)
{
    WorkerEnvelope wrapped = CreateBaseEnvelope();
    wrapped.WorkerEvent = body;

    return wrapped;
}
private WorkerEnvelope CreateBaseEnvelope(WorkerShutdownAck body)
{
WorkerEnvelope envelope = CreateBaseEnvelope();
@@ -1,4 +1,5 @@
using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using MxGateway.Contracts.Proto;
@@ -17,6 +18,12 @@ public interface IWorkerRuntimeSession : IDisposable
WorkerRuntimeHeartbeatSnapshot CaptureHeartbeat();
IReadOnlyList<WorkerEvent> DrainEvents(uint maxEvents);
WorkerFault? DrainFault();
bool CancelCommand(string correlationId);
void RequestShutdown();
Task<MxAccessShutdownResult> ShutdownGracefullyAsync(
@@ -14,6 +14,7 @@ public sealed class MxAccessEventQueue
private readonly object syncRoot = new();
private ulong lastEventSequence;
private WorkerFault? fault;
private bool faultDrained;
public MxAccessEventQueue()
: this(DefaultCapacity)
@@ -163,6 +164,20 @@ public sealed class MxAccessEventQueue
}
}
/// <summary>
/// Hands out the recorded fault at most once: the first call made while a fault is present
/// returns a clone of it and marks it as drained.
/// </summary>
/// <returns>
/// A copy of the fault, or <c>null</c> when there is no fault or it was already drained.
/// </returns>
public WorkerFault? DrainFault()
{
    lock (syncRoot)
    {
        if (fault is { } recorded && !faultDrained)
        {
            faultDrained = true;
            return recorded.Clone();
        }

        return null;
    }
}
private WorkerFault CreateOverflowFault()
{
string message = $"MXAccess outbound event queue reached capacity {capacity}.";
@@ -79,7 +79,14 @@ public sealed class MxAccessSession : IDisposable
}
catch (Exception exception)
{
eventSink.Detach();
try
{
eventSink.Detach();
}
catch
{
// Preserve the creation failure while still releasing the COM object below.
}
if (mxAccessComObject is not null && Marshal.IsComObject(mxAccessComObject))
{
@@ -535,13 +542,15 @@ public sealed class MxAccessSession : IDisposable
private void DisposeCore(ICollection<MxAccessShutdownFailure>? failures)
{
Exception? detachException = null;
try
{
eventSink.Detach();
}
catch (Exception exception) when (failures is not null)
catch (Exception exception)
{
failures.Add(new MxAccessShutdownFailure(
detachException = exception;
failures?.Add(new MxAccessShutdownFailure(
"DetachEvents",
serverHandle: null,
itemHandle: null,
@@ -565,6 +574,10 @@ public sealed class MxAccessSession : IDisposable
}
disposed = true;
if (detachException is not null && failures is null)
{
throw detachException;
}
}
private void ThrowIfDisposed()
@@ -127,6 +127,16 @@ public sealed class MxAccessStaSession : IWorkerRuntimeSession
return eventQueue.Drain(maxEvents);
}
/// <summary>Forwards to the event queue's <c>DrainFault</c> and returns its result.</summary>
public WorkerFault? DrainFault() => eventQueue.DrainFault();
/// <summary>
/// Asks the command dispatcher to cancel the queued command with the given correlation id.
/// </summary>
/// <returns>
/// The dispatcher's result, or <c>false</c> when no dispatcher is currently set.
/// </returns>
public bool CancelCommand(string correlationId)
{
    return commandDispatcher is { } dispatcher
        && dispatcher.CancelQueuedCommand(correlationId);
}
public Task<IReadOnlyList<RegisteredServerHandle>> GetRegisteredServerHandlesAsync(
CancellationToken cancellationToken = default)
{
@@ -207,7 +217,14 @@ public sealed class MxAccessStaSession : IWorkerRuntimeSession
throw new TimeoutException($"MXAccess graceful shutdown exceeded {timeout}.");
}
result = await cleanupTask.ConfigureAwait(false);
try
{
result = await cleanupTask.ConfigureAwait(false);
}
catch (OperationCanceledException) when (!cancellationToken.IsCancellationRequested)
{
throw new TimeoutException($"MXAccess graceful shutdown exceeded {timeout}.");
}
}
TimeSpan remaining = timeout - stopwatch.Elapsed;
@@ -232,7 +249,17 @@ public sealed class MxAccessStaSession : IWorkerRuntimeSession
if (session is not null)
{
staRuntime.InvokeAsync(() => session.Dispose()).GetAwaiter().GetResult();
try
{
staRuntime.InvokeAsync(() => session.Dispose())
.Wait(TimeSpan.FromSeconds(2));
}
catch (AggregateException)
{
}
catch (ObjectDisposedException)
{
}
}
staRuntime.Dispose();
@@ -8,10 +8,13 @@ namespace MxGateway.Worker.Sta;
public sealed class StaCommandDispatcher
{
public const int DefaultMaxPendingCommands = 128;
private readonly HResultConverter hresultConverter;
private readonly IStaCommandExecutor commandExecutor;
private readonly Queue<QueuedStaCommand> commandQueue = new();
private readonly StaRuntime staRuntime;
private readonly int maxPendingCommands;
private readonly object gate = new();
private bool drainActive;
private bool shutdownRequested;
@@ -28,10 +31,27 @@ public sealed class StaCommandDispatcher
StaRuntime staRuntime,
IStaCommandExecutor commandExecutor,
HResultConverter hresultConverter)
: this(staRuntime, commandExecutor, hresultConverter, DefaultMaxPendingCommands)
{
}
/// <summary>
/// Creates a dispatcher with an explicit limit on the number of pending commands.
/// </summary>
/// <param name="staRuntime">Runtime the dispatcher uses; must not be null.</param>
/// <param name="commandExecutor">Executor for dispatched commands; must not be null.</param>
/// <param name="hresultConverter">HRESULT converter; must not be null.</param>
/// <param name="maxPendingCommands">Pending-command limit; must be greater than zero.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="maxPendingCommands"/> is zero or negative.
/// </exception>
/// <exception cref="ArgumentNullException">A reference argument is null.</exception>
public StaCommandDispatcher(
    StaRuntime staRuntime,
    IStaCommandExecutor commandExecutor,
    HResultConverter hresultConverter,
    int maxPendingCommands)
{
    // The limit is validated first, before any of the null checks.
    if (maxPendingCommands < 1)
    {
        throw new ArgumentOutOfRangeException(
            nameof(maxPendingCommands),
            "Max pending STA commands must be greater than zero.");
    }

    if (staRuntime is null)
    {
        throw new ArgumentNullException(nameof(staRuntime));
    }

    if (commandExecutor is null)
    {
        throw new ArgumentNullException(nameof(commandExecutor));
    }

    if (hresultConverter is null)
    {
        throw new ArgumentNullException(nameof(hresultConverter));
    }

    this.staRuntime = staRuntime;
    this.commandExecutor = commandExecutor;
    this.hresultConverter = hresultConverter;
    this.maxPendingCommands = maxPendingCommands;
}
public int PendingCommandCount
@@ -73,6 +93,14 @@ public sealed class StaCommandDispatcher
"The STA command dispatcher is shutting down."));
}
if (commandQueue.Count >= maxPendingCommands)
{
return Task.FromResult(CreateRejectedReply(
command,
ProtocolStatusCode.WorkerUnavailable,
$"The STA command dispatcher already has {maxPendingCommands} pending command(s)."));
}
QueuedStaCommand queuedCommand = new(command);
commandQueue.Enqueue(queuedCommand);
@@ -86,6 +114,51 @@ public sealed class StaCommandDispatcher
}
}
/// <summary>
/// Cancels the first command still waiting in the queue whose correlation id matches
/// <paramref name="correlationId"/> (ordinal comparison), completing it with a
/// <see cref="ProtocolStatusCode.Canceled"/> rejection reply.
/// </summary>
/// <param name="correlationId">Correlation id of the command to cancel.</param>
/// <returns>
/// <c>true</c> when a queued command was found and canceled; <c>false</c> when the id is
/// null/blank, the queue is empty, or no queued command matches.
/// </returns>
public bool CancelQueuedCommand(string correlationId)
{
    if (string.IsNullOrWhiteSpace(correlationId))
    {
        return false;
    }

    lock (gate)
    {
        if (commandQueue.Count == 0)
        {
            return false;
        }

        bool matched = false;
        List<QueuedStaCommand> survivors = new(commandQueue.Count);

        // Empty the queue, setting aside everything except the first matching command.
        while (commandQueue.Count > 0)
        {
            QueuedStaCommand candidate = commandQueue.Dequeue();
            bool isTarget = !matched
                && string.Equals(candidate.Command.CorrelationId, correlationId, StringComparison.Ordinal);

            if (!isTarget)
            {
                survivors.Add(candidate);
                continue;
            }

            candidate.Complete(CreateRejectedReply(
                candidate.Command,
                ProtocolStatusCode.Canceled,
                "The STA command was canceled before execution."));
            matched = true;
        }

        // Put the remaining commands back in their original order.
        foreach (QueuedStaCommand survivor in survivors)
        {
            commandQueue.Enqueue(survivor);
        }

        return matched;
    }
}
public void RequestShutdown()
{
lock (gate)