Improve gateway reliability and client e2e coverage

This commit is contained in:
Joseph Doherty
2026-04-28 06:11:18 -04:00
parent 4fc355b357
commit 907aa49aea
25 changed files with 1153 additions and 83 deletions
+140 -24
View File
@@ -15,6 +15,7 @@ namespace MxGateway.Worker.Ipc;
public sealed class WorkerPipeSession
{
private static readonly TimeSpan EventDrainInterval = TimeSpan.FromMilliseconds(25);
private static readonly TimeSpan BackgroundTaskStopTimeout = TimeSpan.FromSeconds(1);
private const uint EventDrainBatchSize = 128;
private readonly WorkerFrameProtocolOptions _options;
@@ -24,9 +25,12 @@ public sealed class WorkerPipeSession
private readonly IWorkerLogger? _logger;
private readonly WorkerFrameReader _reader;
private readonly WorkerFrameWriter _writer;
private readonly object _commandTaskGate = new();
private readonly HashSet<Task> _activeCommandTasks = new();
private IWorkerRuntimeSession? _runtimeSession;
private long _nextSequence;
private WorkerState _state = WorkerState.Starting;
private bool _acceptingCommands = true;
private bool _watchdogFaultSent;
private bool _shutdownTimedOut;
@@ -206,18 +210,31 @@ public sealed class WorkerPipeSession
private async Task RunMessageLoopAsync(CancellationToken cancellationToken)
{
using CancellationTokenSource loopCancellation = CancellationTokenSource
.CreateLinkedTokenSource(cancellationToken);
using CancellationTokenSource heartbeatCancellation = CancellationTokenSource
.CreateLinkedTokenSource(cancellationToken);
Task heartbeatTask = RunHeartbeatLoopAsync(heartbeatCancellation.Token);
Task eventDrainTask = RunEventDrainLoopAsync(heartbeatCancellation.Token);
Task<WorkerEnvelope> readTask = _reader.ReadAsync(loopCancellation.Token);
try
{
while (!cancellationToken.IsCancellationRequested)
{
Task<WorkerEnvelope> readTask = _reader.ReadAsync(cancellationToken);
Task completedTask = await Task.WhenAny(readTask, heartbeatTask, eventDrainTask).ConfigureAwait(false);
if (completedTask == heartbeatTask)
if (completedTask == readTask)
{
WorkerEnvelope envelope = await readTask.ConfigureAwait(false);
bool keepReading = await DispatchGatewayEnvelopeAsync(envelope, cancellationToken).ConfigureAwait(false);
if (!keepReading)
{
return;
}
readTask = _reader.ReadAsync(loopCancellation.Token);
}
else if (completedTask == heartbeatTask)
{
await heartbeatTask.ConfigureAwait(false);
}
@@ -225,33 +242,52 @@ public sealed class WorkerPipeSession
{
await eventDrainTask.ConfigureAwait(false);
}
WorkerEnvelope envelope = await readTask.ConfigureAwait(false);
bool keepReading = await DispatchGatewayEnvelopeAsync(envelope, cancellationToken).ConfigureAwait(false);
if (!keepReading)
{
return;
}
}
}
finally
{
loopCancellation.Cancel();
heartbeatCancellation.Cancel();
try
{
await heartbeatTask.ConfigureAwait(false);
}
catch (OperationCanceledException)
{
}
await ObserveBackgroundTaskStopAsync(heartbeatTask, "Heartbeat").ConfigureAwait(false);
await ObserveBackgroundTaskStopAsync(eventDrainTask, "EventDrain").ConfigureAwait(false);
}
}
try
{
await eventDrainTask.ConfigureAwait(false);
}
catch (OperationCanceledException)
{
}
/// <summary>
/// Waits up to <see cref="BackgroundTaskStopTimeout"/> for a background task to finish and
/// logs the outcome instead of letting a failure propagate to the caller.
/// </summary>
/// <param name="task">The background task to observe.</param>
/// <param name="taskName">Value written to the "task" field of every log entry.</param>
/// <returns>A task that completes once the outcome was observed or the timeout elapsed.</returns>
private async Task ObserveBackgroundTaskStopAsync(
    Task task,
    string taskName)
{
    void LogStopFailure(string? exceptionText)
    {
        _logger?.Error(
            "WorkerPipeSessionBackgroundTaskStopFailed",
            new Dictionary<string, object?>
            {
                ["task"] = taskName,
                ["exception"] = exceptionText,
            });
    }

    // The delay owns a cancellation source so its timer is released as soon as the
    // background task finishes, rather than lingering until the timeout elapses.
    using CancellationTokenSource delayCancellation = new();
    Task completedTask = await Task
        .WhenAny(task, Task.Delay(BackgroundTaskStopTimeout, delayCancellation.Token))
        .ConfigureAwait(false);
    if (completedTask != task)
    {
        _logger?.Error(
            "WorkerPipeSessionBackgroundTaskStopTimedOut",
            new Dictionary<string, object?>
            {
                ["task"] = taskName,
                ["timeout_ms"] = BackgroundTaskStopTimeout.TotalMilliseconds,
            });

        // The task is abandoned at this point. Attach an observer so a fault that
        // surfaces after the timeout is still logged instead of going unobserved.
        _ = task.ContinueWith(
            faultedTask =>
            {
                LogStopFailure(faultedTask.Exception?.ToString());
            },
            CancellationToken.None,
            TaskContinuationOptions.OnlyOnFaulted | TaskContinuationOptions.ExecuteSynchronously,
            TaskScheduler.Default);
        return;
    }

    delayCancellation.Cancel();
    try
    {
        await task.ConfigureAwait(false);
    }
    catch (OperationCanceledException)
    {
        // Cancellation is not reported as a failure.
    }
    catch (Exception ex)
    {
        LogStopFailure(ex.ToString());
    }
}
@@ -300,7 +336,7 @@ public sealed class WorkerPipeSession
switch (envelope.BodyCase)
{
case WorkerEnvelope.BodyOneofCase.WorkerCommand:
_ = ProcessCommandAsync(envelope, cancellationToken);
TryStartCommandTask(envelope, cancellationToken);
return true;
case WorkerEnvelope.BodyOneofCase.WorkerShutdown:
await ShutdownAsync(envelope.WorkerShutdown, cancellationToken).ConfigureAwait(false);
@@ -333,6 +369,11 @@ public sealed class WorkerPipeSession
try
{
MxCommandReply reply = await runtimeSession.DispatchAsync(staCommand).ConfigureAwait(false);
if (_state is not WorkerState.Ready and not WorkerState.ExecutingCommand)
{
return;
}
await _writer
.WriteAsync(
CreateEnvelope(new WorkerCommandReply
@@ -370,11 +411,13 @@ public sealed class WorkerPipeSession
}
TimeSpan gracePeriod = ResolveGracePeriod(shutdown);
StopAcceptingCommands();
try
{
MxAccessShutdownResult result = await runtimeSession
.ShutdownGracefullyAsync(gracePeriod, cancellationToken)
.ConfigureAwait(false);
await WaitForActiveCommandTasksAsync(gracePeriod, cancellationToken).ConfigureAwait(false);
LogShutdownFailures(result.Failures);
await WriteShutdownAckAsync(CreateShutdownAck(result, shutdown), cancellationToken).ConfigureAwait(false);
}
@@ -387,6 +430,79 @@ public sealed class WorkerPipeSession
}
}
/// <summary>
/// Starts processing a command envelope and tracks the resulting task, unless command
/// intake has been switched off, in which case the envelope is ignored.
/// </summary>
/// <param name="envelope">The envelope handed to <c>ProcessCommandAsync</c>.</param>
/// <param name="cancellationToken">Token forwarded to <c>ProcessCommandAsync</c>.</param>
private void TryStartCommandTask(
    WorkerEnvelope envelope,
    CancellationToken cancellationToken)
{
    Task? startedTask = null;

    // The intake check and the registration share one lock so a command is never
    // started without being added to the active set.
    lock (_commandTaskGate)
    {
        if (_acceptingCommands)
        {
            startedTask = ProcessCommandAsync(envelope, cancellationToken);
            _activeCommandTasks.Add(startedTask);
        }
    }

    if (startedTask is null)
    {
        return;
    }

    // Observation runs outside the lock; the observer removes the task when it ends.
    _ = ObserveCommandTaskAsync(startedTask);
}
/// <summary>
/// Awaits a tracked command task and removes it from the active command set once it
/// has finished, whatever the outcome.
/// </summary>
/// <param name="commandTask">The command task previously added to the active set.</param>
/// <returns>A task that completes after the command task was observed and untracked.</returns>
private async Task ObserveCommandTaskAsync(Task commandTask)
{
    try
    {
        await commandTask.ConfigureAwait(false);
    }
    catch (OperationCanceledException)
    {
        // Cancellation is not reported as a failure.
    }
    catch (Exception ex)
    {
        // The task returned by this method is discarded by its caller, so an exception
        // escaping here would never be observed. Log it instead of losing it.
        _logger?.Error(
            "WorkerPipeSessionCommandTaskFailed",
            new Dictionary<string, object?>
            {
                ["exception"] = ex.ToString(),
            });
    }
    finally
    {
        lock (_commandTaskGate)
        {
            _activeCommandTasks.Remove(commandTask);
        }
    }
}
/// <summary>
/// Switches off command intake. The flag is written under the command task lock.
/// </summary>
private void StopAcceptingCommands()
{
    lock (_commandTaskGate)
    {
        _acceptingCommands = false;
    }
}
/// <summary>
/// Waits for every command task that is active at the time of the call to finish.
/// </summary>
/// <param name="timeout">Maximum time to wait for the active command tasks.</param>
/// <param name="cancellationToken">Token that aborts the wait.</param>
/// <returns>A task that completes when all snapshotted command tasks have finished.</returns>
/// <exception cref="OperationCanceledException">
/// <paramref name="cancellationToken"/> was cancelled before the command tasks finished.
/// </exception>
/// <exception cref="TimeoutException">
/// The command tasks did not finish within <paramref name="timeout"/>.
/// </exception>
private async Task WaitForActiveCommandTasksAsync(
    TimeSpan timeout,
    CancellationToken cancellationToken)
{
    Task[] activeTasks;
    lock (_commandTaskGate)
    {
        // Snapshot under the lock; tasks remove themselves from the set as they finish.
        activeTasks = new Task[_activeCommandTasks.Count];
        _activeCommandTasks.CopyTo(activeTasks);
    }

    if (activeTasks.Length == 0)
    {
        return;
    }

    // The delay gets a linked source so its timer can be released as soon as the
    // commands finish, instead of staying alive for the rest of the timeout.
    using CancellationTokenSource delayCancellation = CancellationTokenSource
        .CreateLinkedTokenSource(cancellationToken);
    Task activeCommandsTask = Task.WhenAll(activeTasks);
    Task timeoutTask = Task.Delay(timeout, delayCancellation.Token);
    Task completedTask = await Task.WhenAny(activeCommandsTask, timeoutTask).ConfigureAwait(false);
    if (completedTask == activeCommandsTask)
    {
        delayCancellation.Cancel();

        // Awaiting rethrows any command failure to the caller.
        await activeCommandsTask.ConfigureAwait(false);
        return;
    }

    // The delay ended first: either the caller cancelled or the timeout elapsed.
    cancellationToken.ThrowIfCancellationRequested();
    throw new TimeoutException($"Worker command tasks did not stop within {timeout}.");
}
private Task WriteShutdownAckAsync(
WorkerShutdownAck shutdownAck,
CancellationToken cancellationToken)
@@ -80,7 +80,7 @@ public sealed class MxAccessEventQueue
}
}
public WorkerEvent Enqueue(MxEvent mxEvent)
public void Enqueue(MxEvent mxEvent)
{
if (mxEvent is null)
{
@@ -109,8 +109,6 @@ public sealed class MxAccessEventQueue
Event = queuedEvent,
};
events.Enqueue(workerEvent);
return workerEvent.Clone();
}
}
@@ -124,7 +122,7 @@ public sealed class MxAccessEventQueue
return false;
}
workerEvent = events.Dequeue().Clone();
workerEvent = events.Dequeue();
return true;
}
}
@@ -144,7 +142,7 @@ public sealed class MxAccessEventQueue
List<WorkerEvent> drained = new(drainCount);
for (int index = 0; index < drainCount; index++)
{
drained.Add(events.Dequeue().Clone());
drained.Add(events.Dequeue());
}
return drained;