Improve gateway reliability and client e2e coverage
This commit is contained in:
@@ -2,5 +2,7 @@ namespace MxGateway.Server.Configuration;
|
||||
|
||||
public enum EventBackpressurePolicy
|
||||
{
|
||||
FailFast
|
||||
FailFast,
|
||||
|
||||
DisconnectSubscriber
|
||||
}
|
||||
|
||||
@@ -108,9 +108,19 @@ public sealed class EventStreamService(
|
||||
if (!writer.TryWrite(publicEvent))
|
||||
{
|
||||
string message = $"Session {session.SessionId} event stream queue overflowed.";
|
||||
session.MarkFaulted(message);
|
||||
metrics.QueueOverflow("grpc-event-stream");
|
||||
metrics.Fault(SessionManagerErrorCode.EventQueueOverflow.ToString());
|
||||
if (options.Value.Events.BackpressurePolicy == EventBackpressurePolicy.FailFast)
|
||||
{
|
||||
session.MarkFaulted(message);
|
||||
metrics.Fault(SessionManagerErrorCode.EventQueueOverflow.ToString());
|
||||
}
|
||||
else
|
||||
{
|
||||
logger.LogDebug(
|
||||
"Disconnecting event stream for session {SessionId} after queue overflow.",
|
||||
session.SessionId);
|
||||
}
|
||||
|
||||
writer.TryComplete(new SessionManagerException(
|
||||
SessionManagerErrorCode.EventQueueOverflow,
|
||||
message));
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace MxGateway.Server.Metrics;
|
||||
@@ -25,8 +26,8 @@ public sealed class GatewayMetrics : IDisposable
|
||||
private readonly Histogram<double> _commandLatencyHistogram;
|
||||
private readonly Histogram<double> _eventStreamSendLatencyHistogram;
|
||||
private readonly Dictionary<string, long> _commandFailuresByMethod = new(StringComparer.OrdinalIgnoreCase);
|
||||
private readonly Dictionary<string, long> _eventsByFamily = new(StringComparer.OrdinalIgnoreCase);
|
||||
private readonly Dictionary<string, long> _eventsBySession = new(StringComparer.Ordinal);
|
||||
private readonly ConcurrentDictionary<string, long> _eventsByFamily = new(StringComparer.OrdinalIgnoreCase);
|
||||
private readonly ConcurrentDictionary<string, long> _eventsBySession = new(StringComparer.Ordinal);
|
||||
private readonly Dictionary<string, long> _retryAttemptsByArea = new(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
private int _openSessions;
|
||||
@@ -173,12 +174,9 @@ public sealed class GatewayMetrics : IDisposable
|
||||
|
||||
public void EventReceived(string sessionId, string family)
|
||||
{
|
||||
lock (_syncRoot)
|
||||
{
|
||||
_eventsReceived++;
|
||||
Increment(_eventsByFamily, family);
|
||||
Increment(_eventsBySession, sessionId);
|
||||
}
|
||||
Interlocked.Increment(ref _eventsReceived);
|
||||
Increment(_eventsByFamily, family);
|
||||
Increment(_eventsBySession, sessionId);
|
||||
|
||||
_eventsReceivedCounter.Add(
|
||||
1,
|
||||
@@ -225,10 +223,7 @@ public sealed class GatewayMetrics : IDisposable
|
||||
|
||||
public void RemoveSessionEvents(string sessionId)
|
||||
{
|
||||
lock (_syncRoot)
|
||||
{
|
||||
_eventsBySession.Remove(sessionId);
|
||||
}
|
||||
_eventsBySession.TryRemove(sessionId, out _);
|
||||
}
|
||||
|
||||
public void QueueOverflow(string queueName)
|
||||
@@ -296,7 +291,7 @@ public sealed class GatewayMetrics : IDisposable
|
||||
CommandsStarted: _commandsStarted,
|
||||
CommandsSucceeded: _commandsSucceeded,
|
||||
CommandsFailed: _commandsFailed,
|
||||
EventsReceived: _eventsReceived,
|
||||
EventsReceived: Interlocked.Read(ref _eventsReceived),
|
||||
QueueOverflows: _queueOverflows,
|
||||
Faults: _faults,
|
||||
WorkerKills: _workerKills,
|
||||
@@ -359,4 +354,9 @@ public sealed class GatewayMetrics : IDisposable
|
||||
values.TryGetValue(key, out long currentValue);
|
||||
values[key] = currentValue + 1;
|
||||
}
|
||||
|
||||
/// <summary>
/// Adds one to the counter stored under <paramref name="key"/>, starting it at 1 when the key is absent.
/// </summary>
/// <param name="values">Counter map to update.</param>
/// <param name="key">Counter name.</param>
private static void Increment(ConcurrentDictionary<string, long> values, string key)
    => values.AddOrUpdate(
        key,
        addValue: 1L,
        updateValueFactory: static (_, previous) => previous + 1L);
|
||||
}
|
||||
|
||||
@@ -41,6 +41,9 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
|
||||
NamedPipeServerStream? pipe = CreatePipe(session.PipeName);
|
||||
WorkerProcessHandle? processHandle = null;
|
||||
IWorkerClient? workerClient = null;
|
||||
using CancellationTokenSource startupCancellation =
|
||||
CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
|
||||
startupCancellation.CancelAfter(session.StartupTimeout);
|
||||
try
|
||||
{
|
||||
session.TransitionTo(SessionState.StartingWorker);
|
||||
@@ -52,11 +55,11 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
|
||||
GatewayContractInfo.WorkerProtocolVersion,
|
||||
session.Nonce,
|
||||
pipe),
|
||||
cancellationToken)
|
||||
startupCancellation.Token)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
session.TransitionTo(SessionState.WaitingForPipe);
|
||||
await WaitForPipeConnectionAsync(pipe, session.StartupTimeout, cancellationToken).ConfigureAwait(false);
|
||||
await WaitForPipeConnectionAsync(pipe, startupCancellation.Token).ConfigureAwait(false);
|
||||
|
||||
session.TransitionTo(SessionState.Handshaking);
|
||||
WorkerFrameProtocolOptions frameOptions = new(
|
||||
@@ -88,14 +91,23 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
|
||||
processHandle = null;
|
||||
|
||||
session.TransitionTo(SessionState.InitializingWorker);
|
||||
await workerClient.StartAsync(cancellationToken).ConfigureAwait(false);
|
||||
await workerClient.StartAsync(startupCancellation.Token).ConfigureAwait(false);
|
||||
|
||||
return workerClient;
|
||||
}
|
||||
catch
|
||||
catch (Exception exception)
|
||||
{
|
||||
if (workerClient is not null)
|
||||
{
|
||||
try
|
||||
{
|
||||
workerClient.Kill("OpenSessionFailed");
|
||||
}
|
||||
catch
|
||||
{
|
||||
// Preserve the startup failure while still disposing below.
|
||||
}
|
||||
|
||||
await workerClient.DisposeAsync().ConfigureAwait(false);
|
||||
}
|
||||
else
|
||||
@@ -119,6 +131,15 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
|
||||
pipe?.Dispose();
|
||||
}
|
||||
|
||||
if (exception is OperationCanceledException
|
||||
&& startupCancellation.IsCancellationRequested
|
||||
&& !cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
throw new TimeoutException(
|
||||
$"Worker session {session.SessionId} did not complete startup within {session.StartupTimeout}.",
|
||||
exception);
|
||||
}
|
||||
|
||||
throw;
|
||||
}
|
||||
}
|
||||
@@ -135,11 +156,8 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
|
||||
|
||||
private static async Task WaitForPipeConnectionAsync(
|
||||
NamedPipeServerStream pipe,
|
||||
TimeSpan startupTimeout,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
using CancellationTokenSource timeout = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
|
||||
timeout.CancelAfter(startupTimeout);
|
||||
await pipe.WaitForConnectionAsync(timeout.Token).ConfigureAwait(false);
|
||||
await pipe.WaitForConnectionAsync(cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@ namespace MxGateway.Server.Workers;
|
||||
public sealed class WorkerClient : IWorkerClient
|
||||
{
|
||||
private const string GatewayVersionFallback = "unknown";
|
||||
private static readonly TimeSpan DisposeTaskTimeout = TimeSpan.FromSeconds(5);
|
||||
private readonly object _syncRoot = new();
|
||||
private readonly WorkerClientConnection _connection;
|
||||
private readonly WorkerClientOptions _options;
|
||||
@@ -286,8 +287,19 @@ public sealed class WorkerClient : IWorkerClient
|
||||
WorkerClientErrorCode.GatewayShutdown,
|
||||
"Worker client was disposed."));
|
||||
|
||||
await WaitForBackgroundTasksAsync(CancellationToken.None).ConfigureAwait(false);
|
||||
await _connection.Stream.DisposeAsync().ConfigureAwait(false);
|
||||
using CancellationTokenSource disposeTimeout = new(DisposeTaskTimeout);
|
||||
try
|
||||
{
|
||||
await WaitForBackgroundTasksAsync(disposeTimeout.Token).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Timed out waiting for worker client background tasks to stop for session {SessionId}.",
|
||||
SessionId);
|
||||
}
|
||||
|
||||
_connection.ProcessHandle?.Dispose();
|
||||
_pendingCommandSlots.Dispose();
|
||||
_stopCts.Dispose();
|
||||
|
||||
@@ -114,6 +114,37 @@ public sealed class EventStreamServiceTests
|
||||
Assert.Equal(1, metrics.GetSnapshot().Faults);
|
||||
}
|
||||
|
||||
[Fact]
public async Task StreamEventsAsync_WhenStreamQueueOverflowsWithDisconnectPolicy_LeavesSessionReady()
{
    // Arrange: a capacity-1 stream queue with the DisconnectSubscriber policy and three queued worker events.
    FakeWorkerClient fakeWorker = new();
    GatewaySession readySession = CreateReadySession(fakeWorker);
    using GatewayMetrics gatewayMetrics = new();
    FakeSessionManager sessionManager = new(readySession);
    EventStreamService streamService = CreateService(
        sessionManager,
        gatewayMetrics,
        queueCapacity: 1,
        backpressurePolicy: EventBackpressurePolicy.DisconnectSubscriber);
    fakeWorker.Events.Add(CreateWorkerEvent(sequence: 1, MxEventFamily.OnDataChange));
    fakeWorker.Events.Add(CreateWorkerEvent(sequence: 2, MxEventFamily.OnDataChange));
    fakeWorker.Events.Add(CreateWorkerEvent(sequence: 3, MxEventFamily.OnDataChange));
    fakeWorker.CompleteAfterConfiguredEvents = true;
    await using IAsyncEnumerator<MxEvent> stream = streamService
        .StreamEventsAsync(CreateRequest(readySession.SessionId), CancellationToken.None)
        .GetAsyncEnumerator();

    // Act: the first read succeeds, the second read surfaces the overflow.
    bool firstRead = await stream.MoveNextAsync().AsTask().WaitAsync(TestTimeout);
    Assert.True(firstRead);
    SessionManagerException overflow = await Assert.ThrowsAsync<SessionManagerException>(
        async () => await stream.MoveNextAsync().AsTask().WaitAsync(TestTimeout));

    // Assert: the subscriber is disconnected, the session stays Ready and no fault is recorded.
    Assert.Equal(SessionManagerErrorCode.EventQueueOverflow, overflow.ErrorCode);
    Assert.Equal(SessionState.Ready, readySession.State);
    GatewayMetricsSnapshot counters = gatewayMetrics.GetSnapshot();
    Assert.Equal(1, counters.QueueOverflows);
    Assert.Equal(0, counters.Faults);
    Assert.Equal(1, counters.StreamDisconnects);
}
|
||||
|
||||
[Fact]
|
||||
public async Task StreamEventsAsync_DoesNotSynthesizeOperationComplete()
|
||||
{
|
||||
@@ -157,7 +188,8 @@ public sealed class EventStreamServiceTests
|
||||
private static EventStreamService CreateService(
|
||||
FakeSessionManager sessionManager,
|
||||
GatewayMetrics? metrics = null,
|
||||
int queueCapacity = 8)
|
||||
int queueCapacity = 8,
|
||||
EventBackpressurePolicy backpressurePolicy = EventBackpressurePolicy.FailFast)
|
||||
{
|
||||
return new EventStreamService(
|
||||
sessionManager,
|
||||
@@ -166,6 +198,7 @@ public sealed class EventStreamServiceTests
|
||||
Events = new EventOptions
|
||||
{
|
||||
QueueCapacity = queueCapacity,
|
||||
BackpressurePolicy = backpressurePolicy,
|
||||
},
|
||||
}),
|
||||
new MxAccessGrpcMapper(),
|
||||
|
||||
@@ -65,13 +65,33 @@ public sealed class SessionWorkerClientFactoryFakeWorkerTests
|
||||
Assert.True(launcher.Process.IsDisposed);
|
||||
}
|
||||
|
||||
private static GatewayOptions CreateOptions()
|
||||
[Fact]
public async Task CreateAsync_WhenFakeWorkerNeverSendsReady_TimesOutAndKillsWorker()
{
    // Arrange: a launcher whose fake worker sends its hello and then stalls, with a one-second startup timeout.
    NeverReadyWorkerProcessLauncher stalledLauncher = new();
    using GatewayMetrics gatewayMetrics = new();
    GatewayOptions gatewayOptions = CreateOptions(startupTimeoutSeconds: 1);
    SessionWorkerClientFactory clientFactory = new(
        stalledLauncher,
        Options.Create(gatewayOptions),
        gatewayMetrics,
        NullLoggerFactory.Instance);
    GatewaySession pendingSession = CreateSession(startupTimeout: TimeSpan.FromSeconds(1));

    // Act: startup must give up with a TimeoutException rather than hang.
    TimeoutException startupTimeout = await Assert.ThrowsAsync<TimeoutException>(
        async () => await clientFactory.CreateAsync(pendingSession, CancellationToken.None).WaitAsync(TestTimeout));

    // Assert: the worker process was killed exactly once and then disposed.
    Assert.Contains("did not complete startup", startupTimeout.Message);
    Assert.Equal(1, stalledLauncher.Process.KillCount);
    Assert.True(stalledLauncher.Process.IsDisposed);
}
|
||||
|
||||
private static GatewayOptions CreateOptions(int startupTimeoutSeconds = 5)
|
||||
{
|
||||
return new GatewayOptions
|
||||
{
|
||||
Worker = new WorkerOptions
|
||||
{
|
||||
StartupTimeoutSeconds = 5,
|
||||
StartupTimeoutSeconds = startupTimeoutSeconds,
|
||||
ShutdownTimeoutSeconds = 5,
|
||||
HeartbeatIntervalSeconds = 30,
|
||||
HeartbeatGraceSeconds = 30,
|
||||
@@ -84,7 +104,7 @@ public sealed class SessionWorkerClientFactoryFakeWorkerTests
|
||||
};
|
||||
}
|
||||
|
||||
private static GatewaySession CreateSession()
|
||||
private static GatewaySession CreateSession(TimeSpan? startupTimeout = null)
|
||||
{
|
||||
return new GatewaySession(
|
||||
FakeWorkerHarness.DefaultSessionId,
|
||||
@@ -94,7 +114,7 @@ public sealed class SessionWorkerClientFactoryFakeWorkerTests
|
||||
"test-client",
|
||||
"fake-worker-session-test",
|
||||
"client-correlation-1",
|
||||
TestTimeout,
|
||||
startupTimeout ?? TestTimeout,
|
||||
TestTimeout,
|
||||
TestTimeout,
|
||||
DateTimeOffset.UtcNow);
|
||||
@@ -172,6 +192,38 @@ public sealed class SessionWorkerClientFactoryFakeWorkerTests
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Test launcher whose fake worker connects to the gateway pipe, reads one gateway envelope,
/// sends its worker hello and then waits until the launch token is cancelled.
/// </summary>
private sealed class NeverReadyWorkerProcessLauncher : IWorkerProcessLauncher
{
    /// <summary>Fake process backing the handle returned by <see cref="LaunchAsync"/>.</summary>
    public FakeWorkerProcess Process { get; } = new(processId: 4680);

    /// <summary>
    /// Starts the stalled fake worker in the background and returns a handle for <see cref="Process"/>.
    /// </summary>
    public Task<WorkerProcessHandle> LaunchAsync(
        WorkerProcessLaunchRequest request,
        CancellationToken cancellationToken = default)
    {
        // Deliberately not awaited: the fake worker runs alongside the caller.
        _ = RunWorkerAsync(request, cancellationToken);

        WorkerProcessHandle handle = CreateHandle(Process);
        return Task.FromResult(handle);
    }

    /// <summary>
    /// Performs the hello exchange and then idles on an infinite delay tied to <paramref name="cancellationToken"/>.
    /// </summary>
    private async Task RunWorkerAsync(
        WorkerProcessLaunchRequest request,
        CancellationToken cancellationToken)
    {
        await using FakeWorkerHarness worker = await FakeWorkerHarness
            .ConnectToGatewayPipeAsync(
                request.SessionId,
                request.Nonce,
                request.PipeName,
                request.ProtocolVersion,
                cancellationToken: cancellationToken)
            .ConfigureAwait(false);

        // The envelope content is irrelevant here; it only has to be consumed before replying.
        await worker.ReadGatewayEnvelopeAsync(cancellationToken).ConfigureAwait(false);

        await worker
            .SendWorkerHelloAsync(
                workerProcessId: Process.Id,
                workerProtocolVersion: request.ProtocolVersion,
                cancellationToken: cancellationToken)
            .ConfigureAwait(false);

        // Nothing further is sent after the hello; wait for cancellation.
        await Task.Delay(Timeout.InfiniteTimeSpan, cancellationToken).ConfigureAwait(false);
    }
}
|
||||
|
||||
private static WorkerProcessHandle CreateHandle(IWorkerProcess process)
|
||||
{
|
||||
return new WorkerProcessHandle(
|
||||
|
||||
@@ -166,7 +166,8 @@ public sealed class WorkerClientTests
|
||||
await pipePair.DisposeWorkerSideAsync();
|
||||
|
||||
await WaitUntilAsync(
|
||||
() => client.State == WorkerClientState.Faulted,
|
||||
() => client.State == WorkerClientState.Faulted
|
||||
&& metrics.GetSnapshot().WorkersRunning == 0,
|
||||
TestTimeout);
|
||||
|
||||
GatewayMetricsSnapshot snapshot = metrics.GetSnapshot();
|
||||
@@ -174,6 +175,22 @@ public sealed class WorkerClientTests
|
||||
Assert.Equal(1, snapshot.WorkerExits);
|
||||
}
|
||||
|
||||
[Fact]
public async Task DisposeAsync_WhenPipeReadIsBlocked_ReturnsWithinBoundedTimeout()
{
    // Arrange: a handshaken client whose worker side never sends anything further.
    await using PipePair pipePair = await PipePair.CreateAsync();
    WorkerClient client = CreateClient(pipePair);
    await CompleteHandshakeAsync(client, pipePair);

    // Act: time the dispose with a monotonic clock so wall-clock adjustments
    // cannot produce a false pass or a false failure.
    System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();
    await client.DisposeAsync().AsTask().WaitAsync(TestTimeout);
    stopwatch.Stop();
    TimeSpan elapsed = stopwatch.Elapsed;

    // Assert: dispose returned within the four-second bound.
    Assert.True(
        elapsed < TimeSpan.FromSeconds(4),
        $"DisposeAsync took {elapsed.TotalMilliseconds:N0}ms.");
}
|
||||
|
||||
[Fact]
|
||||
public async Task ReadLoop_WhenHeartbeatArrives_UpdatesLastHeartbeatAndWorkerProcess()
|
||||
{
|
||||
|
||||
@@ -60,4 +60,22 @@ public sealed class GatewayMetricsTests
|
||||
|
||||
Assert.Equal("depth", exception.ParamName);
|
||||
}
|
||||
|
||||
[Fact]
public void RemoveSessionEvents_RemovesOnlyThatSession()
{
    // Arrange: one event for each of two sessions, in different families.
    using GatewayMetrics gatewayMetrics = new();
    gatewayMetrics.EventReceived("session-1", "OnDataChange");
    gatewayMetrics.EventReceived("session-2", "OnWriteComplete");

    // Act
    gatewayMetrics.RemoveSessionEvents("session-1");
    GatewayMetricsSnapshot counters = gatewayMetrics.GetSnapshot();

    // Assert: only the per-session entry for session-1 is gone; totals and family counts are untouched.
    Assert.Equal(2, counters.EventsReceived);
    bool removedSessionStillTracked = counters.EventsBySession.ContainsKey("session-1");
    Assert.False(removedSessionStillTracked);
    Assert.Equal(1, counters.EventsBySession["session-2"]);
    Assert.Equal(1, counters.EventsByFamily["OnDataChange"]);
    Assert.Equal(1, counters.EventsByFamily["OnWriteComplete"]);
}
|
||||
}
|
||||
|
||||
@@ -304,6 +304,45 @@ public sealed class WorkerPipeSessionTests
|
||||
await SendShutdownAndWaitAsync(pipePair, runTask, cancellation.Token);
|
||||
}
|
||||
|
||||
[Fact]
public async Task RunAsync_WhenShutdownArrivesDuringCommand_DropsLateReplyAndWritesShutdownAck()
{
    // Arrange: a session whose runtime blocks inside dispatch.
    using CancellationTokenSource testDeadline = new(TimeSpan.FromSeconds(5));
    CancellationToken token = testDeadline.Token;
    using PipePair pipes = await PipePair.CreateAsync(token);
    FakeRuntimeSession blockingRuntime = new() { BlockDispatch = true };
    WorkerPipeSessionOptions sessionOptions = new()
    {
        HeartbeatInterval = TimeSpan.FromSeconds(1),
        HeartbeatGrace = TimeSpan.FromSeconds(5),
    };
    WorkerPipeSession pipeSession = CreatePipeSession(pipes.WorkerStream, blockingRuntime, sessionOptions);
    Task sessionRun = pipeSession.RunAsync(token);
    await CompleteGatewayHandshakeAsync(pipes, token);

    // Act: send a command, wait until its dispatch has started, then request shutdown.
    await pipes.GatewayWriter.WriteAsync(CreateCommandEnvelope("command-during-shutdown"), token);
    Assert.True(blockingRuntime.DispatchStarted.Wait(TimeSpan.FromSeconds(2)));
    await pipes.GatewayWriter.WriteAsync(CreateShutdownEnvelope(), token);
    WorkerEnvelope ackEnvelope = await ReadUntilAsync(
        pipes.GatewayReader,
        WorkerEnvelope.BodyOneofCase.WorkerShutdownAck,
        token);

    // Assert: the ack reports Ok and the run loop finishes within two seconds.
    Assert.Equal(ProtocolStatusCode.Ok, ackEnvelope.WorkerShutdownAck.Status.Code);
    Task stopDeadline = Task.Delay(TimeSpan.FromSeconds(2), token);
    Task firstFinished = await Task.WhenAny(sessionRun, stopDeadline);
    Assert.Same(sessionRun, firstFinished);
    await sessionRun;
}
|
||||
|
||||
private static WorkerPipeSession CreateSession(
|
||||
Stream inbound,
|
||||
Stream outbound,
|
||||
@@ -440,7 +479,7 @@ public sealed class WorkerPipeSessionTests
|
||||
|
||||
Assert.Equal(ProtocolStatusCode.Ok, shutdownAck.WorkerShutdownAck.Status.Code);
|
||||
Task completedTask = await Task
|
||||
.WhenAny(runTask, Task.Delay(TimeSpan.FromSeconds(2), cancellationToken))
|
||||
.WhenAny(runTask, Task.Delay(TimeSpan.FromSeconds(5), cancellationToken))
|
||||
.ConfigureAwait(false);
|
||||
|
||||
Assert.Same(runTask, completedTask);
|
||||
|
||||
@@ -12,17 +12,17 @@ public sealed class MxAccessEventQueueTests
|
||||
{
|
||||
MxAccessEventQueue queue = new(capacity: 4);
|
||||
|
||||
WorkerEvent first = queue.Enqueue(CreateEvent(MxEventFamily.OnDataChange, itemHandle: 10));
|
||||
WorkerEvent second = queue.Enqueue(CreateEvent(MxEventFamily.OnWriteComplete, itemHandle: 11));
|
||||
queue.Enqueue(CreateEvent(MxEventFamily.OnDataChange, itemHandle: 10));
|
||||
queue.Enqueue(CreateEvent(MxEventFamily.OnWriteComplete, itemHandle: 11));
|
||||
|
||||
Assert.Equal(1UL, first.Event.WorkerSequence);
|
||||
Assert.Equal(2UL, second.Event.WorkerSequence);
|
||||
Assert.NotNull(first.Event.WorkerTimestamp);
|
||||
Assert.Equal(2, queue.Count);
|
||||
Assert.Equal(2UL, queue.LastEventSequence);
|
||||
|
||||
Assert.True(queue.TryDequeue(out WorkerEvent? dequeuedFirst));
|
||||
Assert.True(queue.TryDequeue(out WorkerEvent? dequeuedSecond));
|
||||
Assert.Equal(1UL, dequeuedFirst?.Event.WorkerSequence);
|
||||
Assert.Equal(2UL, dequeuedSecond?.Event.WorkerSequence);
|
||||
Assert.NotNull(dequeuedFirst?.Event.WorkerTimestamp);
|
||||
Assert.Equal(10, dequeuedFirst?.Event.ItemHandle);
|
||||
Assert.Equal(11, dequeuedSecond?.Event.ItemHandle);
|
||||
Assert.False(queue.TryDequeue(out _));
|
||||
|
||||
@@ -15,6 +15,7 @@ namespace MxGateway.Worker.Ipc;
|
||||
public sealed class WorkerPipeSession
|
||||
{
|
||||
private static readonly TimeSpan EventDrainInterval = TimeSpan.FromMilliseconds(25);
|
||||
private static readonly TimeSpan BackgroundTaskStopTimeout = TimeSpan.FromSeconds(1);
|
||||
private const uint EventDrainBatchSize = 128;
|
||||
|
||||
private readonly WorkerFrameProtocolOptions _options;
|
||||
@@ -24,9 +25,12 @@ public sealed class WorkerPipeSession
|
||||
private readonly IWorkerLogger? _logger;
|
||||
private readonly WorkerFrameReader _reader;
|
||||
private readonly WorkerFrameWriter _writer;
|
||||
private readonly object _commandTaskGate = new();
|
||||
private readonly HashSet<Task> _activeCommandTasks = new();
|
||||
private IWorkerRuntimeSession? _runtimeSession;
|
||||
private long _nextSequence;
|
||||
private WorkerState _state = WorkerState.Starting;
|
||||
private bool _acceptingCommands = true;
|
||||
private bool _watchdogFaultSent;
|
||||
private bool _shutdownTimedOut;
|
||||
|
||||
@@ -206,18 +210,31 @@ public sealed class WorkerPipeSession
|
||||
|
||||
private async Task RunMessageLoopAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
using CancellationTokenSource loopCancellation = CancellationTokenSource
|
||||
.CreateLinkedTokenSource(cancellationToken);
|
||||
using CancellationTokenSource heartbeatCancellation = CancellationTokenSource
|
||||
.CreateLinkedTokenSource(cancellationToken);
|
||||
Task heartbeatTask = RunHeartbeatLoopAsync(heartbeatCancellation.Token);
|
||||
Task eventDrainTask = RunEventDrainLoopAsync(heartbeatCancellation.Token);
|
||||
Task<WorkerEnvelope> readTask = _reader.ReadAsync(loopCancellation.Token);
|
||||
|
||||
try
|
||||
{
|
||||
while (!cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
Task<WorkerEnvelope> readTask = _reader.ReadAsync(cancellationToken);
|
||||
Task completedTask = await Task.WhenAny(readTask, heartbeatTask, eventDrainTask).ConfigureAwait(false);
|
||||
if (completedTask == heartbeatTask)
|
||||
if (completedTask == readTask)
|
||||
{
|
||||
WorkerEnvelope envelope = await readTask.ConfigureAwait(false);
|
||||
bool keepReading = await DispatchGatewayEnvelopeAsync(envelope, cancellationToken).ConfigureAwait(false);
|
||||
if (!keepReading)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
readTask = _reader.ReadAsync(loopCancellation.Token);
|
||||
}
|
||||
else if (completedTask == heartbeatTask)
|
||||
{
|
||||
await heartbeatTask.ConfigureAwait(false);
|
||||
}
|
||||
@@ -225,33 +242,52 @@ public sealed class WorkerPipeSession
|
||||
{
|
||||
await eventDrainTask.ConfigureAwait(false);
|
||||
}
|
||||
|
||||
WorkerEnvelope envelope = await readTask.ConfigureAwait(false);
|
||||
bool keepReading = await DispatchGatewayEnvelopeAsync(envelope, cancellationToken).ConfigureAwait(false);
|
||||
if (!keepReading)
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
loopCancellation.Cancel();
|
||||
heartbeatCancellation.Cancel();
|
||||
try
|
||||
{
|
||||
await heartbeatTask.ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
}
|
||||
await ObserveBackgroundTaskStopAsync(heartbeatTask, "Heartbeat").ConfigureAwait(false);
|
||||
await ObserveBackgroundTaskStopAsync(eventDrainTask, "EventDrain").ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await eventDrainTask.ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
}
|
||||
/// <summary>
/// Waits up to <see cref="BackgroundTaskStopTimeout"/> for a background task to finish and logs
/// either a stop timeout or a non-cancellation failure. Never throws for the observed task's outcome.
/// </summary>
/// <param name="task">Background task that has already been asked to stop.</param>
/// <param name="taskName">Name written to the "task" log field.</param>
private async Task ObserveBackgroundTaskStopAsync(
    Task task,
    string taskName)
{
    using (CancellationTokenSource delayCancellation = new())
    {
        Task completedTask = await Task
            .WhenAny(task, Task.Delay(BackgroundTaskStopTimeout, delayCancellation.Token))
            .ConfigureAwait(false);

        // Release the delay timer immediately instead of leaving it armed until the timeout elapses.
        delayCancellation.Cancel();

        if (completedTask != task)
        {
            _logger?.Error(
                "WorkerPipeSessionBackgroundTaskStopTimedOut",
                new Dictionary<string, object?>
                {
                    ["task"] = taskName,
                    ["timeout_ms"] = BackgroundTaskStopTimeout.TotalMilliseconds,
                });

            // The task is abandoned here; read its exception if it faults later so the
            // failure does not end up as an unobserved task exception.
            _ = task.ContinueWith(
                abandoned => { _ = abandoned.Exception; },
                CancellationToken.None,
                TaskContinuationOptions.OnlyOnFaulted | TaskContinuationOptions.ExecuteSynchronously,
                TaskScheduler.Default);
            return;
        }
    }

    try
    {
        await task.ConfigureAwait(false);
    }
    catch (OperationCanceledException)
    {
        // Cancellation is not treated as a failure.
    }
    catch (Exception ex)
    {
        _logger?.Error(
            "WorkerPipeSessionBackgroundTaskStopFailed",
            new Dictionary<string, object?>
            {
                ["task"] = taskName,
                ["exception"] = ex.ToString(),
            });
    }
}
|
||||
|
||||
@@ -300,7 +336,7 @@ public sealed class WorkerPipeSession
|
||||
switch (envelope.BodyCase)
|
||||
{
|
||||
case WorkerEnvelope.BodyOneofCase.WorkerCommand:
|
||||
_ = ProcessCommandAsync(envelope, cancellationToken);
|
||||
TryStartCommandTask(envelope, cancellationToken);
|
||||
return true;
|
||||
case WorkerEnvelope.BodyOneofCase.WorkerShutdown:
|
||||
await ShutdownAsync(envelope.WorkerShutdown, cancellationToken).ConfigureAwait(false);
|
||||
@@ -333,6 +369,11 @@ public sealed class WorkerPipeSession
|
||||
try
|
||||
{
|
||||
MxCommandReply reply = await runtimeSession.DispatchAsync(staCommand).ConfigureAwait(false);
|
||||
if (_state is not WorkerState.Ready and not WorkerState.ExecutingCommand)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
await _writer
|
||||
.WriteAsync(
|
||||
CreateEnvelope(new WorkerCommandReply
|
||||
@@ -370,11 +411,13 @@ public sealed class WorkerPipeSession
|
||||
}
|
||||
|
||||
TimeSpan gracePeriod = ResolveGracePeriod(shutdown);
|
||||
StopAcceptingCommands();
|
||||
try
|
||||
{
|
||||
MxAccessShutdownResult result = await runtimeSession
|
||||
.ShutdownGracefullyAsync(gracePeriod, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
await WaitForActiveCommandTasksAsync(gracePeriod, cancellationToken).ConfigureAwait(false);
|
||||
LogShutdownFailures(result.Failures);
|
||||
await WriteShutdownAckAsync(CreateShutdownAck(result, shutdown), cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
@@ -387,6 +430,79 @@ public sealed class WorkerPipeSession
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Starts processing a command envelope and tracks the resulting task, unless commands are no
/// longer being accepted, in which case the envelope is ignored.
/// </summary>
private void TryStartCommandTask(
    WorkerEnvelope envelope,
    CancellationToken cancellationToken)
{
    Task? startedTask = null;

    // The accept check and the registration share one lock so a task cannot be
    // started after StopAcceptingCommands has flipped the flag.
    lock (_commandTaskGate)
    {
        if (_acceptingCommands)
        {
            startedTask = ProcessCommandAsync(envelope, cancellationToken);
            _activeCommandTasks.Add(startedTask);
        }
    }

    if (startedTask is null)
    {
        return;
    }

    // Fire-and-forget observer that unregisters the task once it completes.
    _ = ObserveCommandTaskAsync(startedTask);
}
|
||||
|
||||
/// <summary>
/// Awaits a tracked command task and removes it from the active set when it completes.
/// Cancellation is ignored; any other failure is logged so it is not lost on the
/// discarded observer task.
/// </summary>
/// <param name="commandTask">Task previously added to the active command set.</param>
private async Task ObserveCommandTaskAsync(Task commandTask)
{
    try
    {
        await commandTask.ConfigureAwait(false);
    }
    catch (OperationCanceledException)
    {
        // Cancellation is not treated as a failure.
    }
    catch (Exception ex)
    {
        // Callers discard this observer task, so an exception escaping here would only
        // ever surface as an unobserved task exception. Log it instead.
        _logger?.Error(
            "WorkerPipeSessionCommandTaskFailed",
            new Dictionary<string, object?>
            {
                ["exception"] = ex.ToString(),
            });
    }
    finally
    {
        lock (_commandTaskGate)
        {
            _activeCommandTasks.Remove(commandTask);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Clears the accepting-commands flag under the command task gate, so later calls to
/// <see cref="TryStartCommandTask"/> ignore incoming commands.
/// </summary>
private void StopAcceptingCommands()
{
    lock (_commandTaskGate)
    {
        _acceptingCommands = false;
    }
}
|
||||
|
||||
/// <summary>
/// Waits for every command task that is active at the time of the call to finish.
/// </summary>
/// <param name="timeout">Maximum time to wait for the snapshot of active tasks.</param>
/// <param name="cancellationToken">Cancels the wait.</param>
/// <exception cref="OperationCanceledException">
/// <paramref name="cancellationToken"/> was cancelled before the tasks finished.
/// </exception>
/// <exception cref="TimeoutException">The tasks did not finish within <paramref name="timeout"/>.</exception>
private async Task WaitForActiveCommandTasksAsync(
    TimeSpan timeout,
    CancellationToken cancellationToken)
{
    Task[] activeTasks;
    lock (_commandTaskGate)
    {
        // Snapshot under the lock; observers remove finished tasks from the set concurrently.
        activeTasks = new Task[_activeCommandTasks.Count];
        _activeCommandTasks.CopyTo(activeTasks);
    }

    if (activeTasks.Length == 0)
    {
        return;
    }

    using CancellationTokenSource timeoutCancellation =
        CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
    Task activeCommandsTask = Task.WhenAll(activeTasks);
    Task timeoutTask = Task.Delay(timeout, timeoutCancellation.Token);
    Task completedTask = await Task.WhenAny(activeCommandsTask, timeoutTask).ConfigureAwait(false);
    if (completedTask == activeCommandsTask)
    {
        // Stop the delay timer now rather than leaving it armed for the rest of the grace period.
        timeoutCancellation.Cancel();

        // Re-await so a failure from any command task is rethrown to the caller.
        await activeCommandsTask.ConfigureAwait(false);
        return;
    }

    // The delay finished first: either the caller cancelled or the grace period ran out.
    cancellationToken.ThrowIfCancellationRequested();
    throw new TimeoutException($"Worker command tasks did not stop within {timeout}.");
}
|
||||
|
||||
private Task WriteShutdownAckAsync(
|
||||
WorkerShutdownAck shutdownAck,
|
||||
CancellationToken cancellationToken)
|
||||
|
||||
@@ -80,7 +80,7 @@ public sealed class MxAccessEventQueue
|
||||
}
|
||||
}
|
||||
|
||||
public WorkerEvent Enqueue(MxEvent mxEvent)
|
||||
public void Enqueue(MxEvent mxEvent)
|
||||
{
|
||||
if (mxEvent is null)
|
||||
{
|
||||
@@ -109,8 +109,6 @@ public sealed class MxAccessEventQueue
|
||||
Event = queuedEvent,
|
||||
};
|
||||
events.Enqueue(workerEvent);
|
||||
|
||||
return workerEvent.Clone();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -124,7 +122,7 @@ public sealed class MxAccessEventQueue
|
||||
return false;
|
||||
}
|
||||
|
||||
workerEvent = events.Dequeue().Clone();
|
||||
workerEvent = events.Dequeue();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -144,7 +142,7 @@ public sealed class MxAccessEventQueue
|
||||
List<WorkerEvent> drained = new(drainCount);
|
||||
for (int index = 0; index < drainCount; index++)
|
||||
{
|
||||
drained.Add(events.Dequeue().Clone());
|
||||
drained.Add(events.Dequeue());
|
||||
}
|
||||
|
||||
return drained;
|
||||
|
||||
Reference in New Issue
Block a user