Implement graceful worker shutdown
This commit is contained in:
@@ -11,13 +11,26 @@ public sealed class WorkerPipeClient : IWorkerPipeClient
|
||||
public const int DefaultConnectTimeoutMilliseconds = 30000;
|
||||
|
||||
private readonly int _connectTimeoutMilliseconds;
|
||||
private readonly IWorkerLogger? _logger;
|
||||
|
||||
public WorkerPipeClient()
|
||||
: this(DefaultConnectTimeoutMilliseconds)
|
||||
: this(null, DefaultConnectTimeoutMilliseconds)
|
||||
{
|
||||
}
|
||||
|
||||
public WorkerPipeClient(IWorkerLogger? logger)
|
||||
: this(logger, DefaultConnectTimeoutMilliseconds)
|
||||
{
|
||||
}
|
||||
|
||||
public WorkerPipeClient(int connectTimeoutMilliseconds)
|
||||
: this(null, connectTimeoutMilliseconds)
|
||||
{
|
||||
}
|
||||
|
||||
public WorkerPipeClient(
|
||||
IWorkerLogger? logger,
|
||||
int connectTimeoutMilliseconds)
|
||||
{
|
||||
if (connectTimeoutMilliseconds <= 0)
|
||||
{
|
||||
@@ -27,6 +40,7 @@ public sealed class WorkerPipeClient : IWorkerPipeClient
|
||||
}
|
||||
|
||||
_connectTimeoutMilliseconds = connectTimeoutMilliseconds;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
public async Task RunAsync(
|
||||
@@ -48,8 +62,9 @@ public sealed class WorkerPipeClient : IWorkerPipeClient
|
||||
|
||||
await ConnectAsync(pipe, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
WorkerPipeSession session = new(pipe, frameOptions);
|
||||
WorkerPipeSession session = new(pipe, frameOptions, _logger);
|
||||
await session.CompleteStartupHandshakeAsync(cancellationToken).ConfigureAwait(false);
|
||||
await session.RunAsync(cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private Task ConnectAsync(
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.IO;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Google.Protobuf.WellKnownTypes;
|
||||
using MxGateway.Contracts.Proto;
|
||||
using MxGateway.Worker.Bootstrap;
|
||||
using MxGateway.Worker.MxAccess;
|
||||
using MxGateway.Worker.Sta;
|
||||
|
||||
namespace MxGateway.Worker.Ipc;
|
||||
|
||||
@@ -13,19 +16,24 @@ public sealed class WorkerPipeSession
|
||||
{
|
||||
private readonly WorkerFrameProtocolOptions _options;
|
||||
private readonly Func<int> _processIdProvider;
|
||||
private readonly IWorkerLogger? _logger;
|
||||
private readonly WorkerFrameReader _reader;
|
||||
private readonly WorkerFrameWriter _writer;
|
||||
private MxAccessStaSession? _mxAccessStaSession;
|
||||
private long _nextSequence;
|
||||
private bool _shutdownCompleted;
|
||||
private bool _shutdownTimedOut;
|
||||
|
||||
public WorkerPipeSession(
|
||||
Stream stream,
|
||||
WorkerFrameProtocolOptions options)
|
||||
WorkerFrameProtocolOptions options,
|
||||
IWorkerLogger? logger = null)
|
||||
: this(
|
||||
new WorkerFrameReader(stream, options),
|
||||
new WorkerFrameWriter(stream, options),
|
||||
options,
|
||||
() => Process.GetCurrentProcess().Id)
|
||||
() => Process.GetCurrentProcess().Id,
|
||||
logger)
|
||||
{
|
||||
}
|
||||
|
||||
@@ -33,12 +41,14 @@ public sealed class WorkerPipeSession
|
||||
WorkerFrameReader reader,
|
||||
WorkerFrameWriter writer,
|
||||
WorkerFrameProtocolOptions options,
|
||||
Func<int> processIdProvider)
|
||||
Func<int> processIdProvider,
|
||||
IWorkerLogger? logger = null)
|
||||
{
|
||||
_reader = reader ?? throw new ArgumentNullException(nameof(reader));
|
||||
_writer = writer ?? throw new ArgumentNullException(nameof(writer));
|
||||
_options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
_processIdProvider = processIdProvider ?? throw new ArgumentNullException(nameof(processIdProvider));
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
public Task CompleteStartupHandshakeAsync(CancellationToken cancellationToken = default)
|
||||
@@ -95,6 +105,44 @@ public sealed class WorkerPipeSession
|
||||
}
|
||||
}
|
||||
|
||||
public async Task RunAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
try
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
WorkerEnvelope envelope = await _reader.ReadAsync(cancellationToken).ConfigureAwait(false);
|
||||
switch (envelope.BodyCase)
|
||||
{
|
||||
case WorkerEnvelope.BodyOneofCase.WorkerCommand:
|
||||
await HandleCommandAsync(envelope, cancellationToken).ConfigureAwait(false);
|
||||
break;
|
||||
case WorkerEnvelope.BodyOneofCase.WorkerShutdown:
|
||||
await HandleShutdownAsync(envelope.WorkerShutdown, cancellationToken).ConfigureAwait(false);
|
||||
return;
|
||||
case WorkerEnvelope.BodyOneofCase.WorkerCancel:
|
||||
break;
|
||||
default:
|
||||
throw new WorkerFrameProtocolException(
|
||||
WorkerFrameProtocolErrorCode.UnexpectedEnvelopeBody,
|
||||
$"Worker received unexpected gateway envelope body {envelope.BodyCase} after startup.");
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (WorkerFrameProtocolException exception)
|
||||
{
|
||||
await TryWriteFaultAsync(exception, cancellationToken).ConfigureAwait(false);
|
||||
throw;
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (!_shutdownCompleted && !_shutdownTimedOut)
|
||||
{
|
||||
_mxAccessStaSession?.Dispose();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void ValidateGatewayHello(WorkerEnvelope envelope)
|
||||
{
|
||||
if (envelope.BodyCase != WorkerEnvelope.BodyOneofCase.GatewayHello)
|
||||
@@ -178,6 +226,25 @@ public sealed class WorkerPipeSession
|
||||
}
|
||||
}
|
||||
|
||||
private async Task TryWriteFaultAsync(
|
||||
WorkerFault fault,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
await _writer
|
||||
.WriteAsync(CreateEnvelope(fault), cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception faultWriteException) when (
|
||||
faultWriteException is IOException
|
||||
|| faultWriteException is ObjectDisposedException
|
||||
|| faultWriteException is WorkerFrameProtocolException)
|
||||
{
|
||||
// The shutdown timeout is the actionable error.
|
||||
}
|
||||
}
|
||||
|
||||
private WorkerEnvelope CreateEnvelope(WorkerHello hello)
|
||||
{
|
||||
return CreateBaseEnvelope(hello);
|
||||
@@ -193,6 +260,21 @@ public sealed class WorkerPipeSession
|
||||
return CreateBaseEnvelope(fault);
|
||||
}
|
||||
|
||||
private WorkerEnvelope CreateEnvelope(WorkerCommandReply reply)
|
||||
{
|
||||
return CreateBaseEnvelope(reply);
|
||||
}
|
||||
|
||||
private WorkerEnvelope CreateEnvelope(WorkerEvent workerEvent)
|
||||
{
|
||||
return CreateBaseEnvelope(workerEvent);
|
||||
}
|
||||
|
||||
private WorkerEnvelope CreateEnvelope(WorkerShutdownAck shutdownAck)
|
||||
{
|
||||
return CreateBaseEnvelope(shutdownAck);
|
||||
}
|
||||
|
||||
private WorkerEnvelope CreateBaseEnvelope(WorkerHello body)
|
||||
{
|
||||
WorkerEnvelope envelope = CreateBaseEnvelope();
|
||||
@@ -214,6 +296,28 @@ public sealed class WorkerPipeSession
|
||||
return envelope;
|
||||
}
|
||||
|
||||
private WorkerEnvelope CreateBaseEnvelope(WorkerCommandReply body)
|
||||
{
|
||||
WorkerEnvelope envelope = CreateBaseEnvelope();
|
||||
envelope.CorrelationId = body.Reply?.CorrelationId ?? string.Empty;
|
||||
envelope.WorkerCommandReply = body;
|
||||
return envelope;
|
||||
}
|
||||
|
||||
private WorkerEnvelope CreateBaseEnvelope(WorkerEvent body)
|
||||
{
|
||||
WorkerEnvelope envelope = CreateBaseEnvelope();
|
||||
envelope.WorkerEvent = body.Clone();
|
||||
return envelope;
|
||||
}
|
||||
|
||||
private WorkerEnvelope CreateBaseEnvelope(WorkerShutdownAck body)
|
||||
{
|
||||
WorkerEnvelope envelope = CreateBaseEnvelope();
|
||||
envelope.WorkerShutdownAck = body;
|
||||
return envelope;
|
||||
}
|
||||
|
||||
private WorkerEnvelope CreateBaseEnvelope()
|
||||
{
|
||||
return new WorkerEnvelope
|
||||
@@ -246,6 +350,75 @@ public sealed class WorkerPipeSession
|
||||
}
|
||||
}
|
||||
|
||||
private async Task HandleCommandAsync(
|
||||
WorkerEnvelope envelope,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
if (_mxAccessStaSession is null)
|
||||
{
|
||||
throw new InvalidOperationException("MXAccess STA session is not initialized.");
|
||||
}
|
||||
|
||||
StaCommand command = new(
|
||||
_options.SessionId,
|
||||
envelope.CorrelationId,
|
||||
envelope.WorkerCommand.Command,
|
||||
envelope.WorkerCommand.EnqueueTimestamp,
|
||||
cancellationToken);
|
||||
|
||||
MxCommandReply mxReply = await _mxAccessStaSession
|
||||
.DispatchAsync(command)
|
||||
.ConfigureAwait(false);
|
||||
WorkerCommandReply reply = new()
|
||||
{
|
||||
Reply = mxReply,
|
||||
CompletedTimestamp = Timestamp.FromDateTime(DateTime.UtcNow),
|
||||
};
|
||||
|
||||
await _writer.WriteAsync(CreateEnvelope(reply), cancellationToken).ConfigureAwait(false);
|
||||
await DrainEventsAsync(cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private async Task HandleShutdownAsync(
|
||||
WorkerShutdown shutdown,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
TimeSpan gracePeriod = ResolveGracePeriod(shutdown);
|
||||
try
|
||||
{
|
||||
MxAccessShutdownResult result = _mxAccessStaSession is null
|
||||
? new MxAccessShutdownResult(Array.Empty<MxAccessShutdownFailure>())
|
||||
: await _mxAccessStaSession
|
||||
.ShutdownGracefullyAsync(gracePeriod, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
LogShutdownFailures(result.Failures);
|
||||
await _writer
|
||||
.WriteAsync(CreateEnvelope(CreateShutdownAck(result)), cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
_shutdownCompleted = true;
|
||||
}
|
||||
catch (TimeoutException exception)
|
||||
{
|
||||
_shutdownTimedOut = true;
|
||||
await TryWriteFaultAsync(CreateShutdownTimeoutFault(exception), cancellationToken).ConfigureAwait(false);
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
private async Task DrainEventsAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
if (_mxAccessStaSession is null)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
foreach (WorkerEvent workerEvent in _mxAccessStaSession.DrainEvents(maxEvents: 0))
|
||||
{
|
||||
await _writer.WriteAsync(CreateEnvelope(workerEvent), cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
private WorkerReady CreateWorkerReady()
|
||||
{
|
||||
return new WorkerReady
|
||||
@@ -257,6 +430,49 @@ public sealed class WorkerPipeSession
|
||||
};
|
||||
}
|
||||
|
||||
private static TimeSpan ResolveGracePeriod(WorkerShutdown shutdown)
|
||||
{
|
||||
if (shutdown.GracePeriod is null)
|
||||
{
|
||||
return TimeSpan.FromSeconds(10);
|
||||
}
|
||||
|
||||
TimeSpan gracePeriod = shutdown.GracePeriod.ToTimeSpan();
|
||||
return gracePeriod <= TimeSpan.Zero
|
||||
? TimeSpan.FromSeconds(10)
|
||||
: gracePeriod;
|
||||
}
|
||||
|
||||
private static WorkerShutdownAck CreateShutdownAck(MxAccessShutdownResult result)
|
||||
{
|
||||
return new WorkerShutdownAck
|
||||
{
|
||||
Status = new ProtocolStatus
|
||||
{
|
||||
Code = ProtocolStatusCode.Ok,
|
||||
Message = result.Succeeded
|
||||
? "Graceful shutdown completed."
|
||||
: $"Graceful shutdown completed with {result.Failures.Count} cleanup failure(s).",
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
private void LogShutdownFailures(IReadOnlyList<MxAccessShutdownFailure> failures)
|
||||
{
|
||||
foreach (MxAccessShutdownFailure failure in failures)
|
||||
{
|
||||
_logger?.Error("WorkerGracefulShutdownCleanupFailed", new Dictionary<string, object?>
|
||||
{
|
||||
["session_id"] = _options.SessionId,
|
||||
["operation"] = failure.Operation,
|
||||
["server_handle"] = failure.ServerHandle,
|
||||
["item_handle"] = failure.ItemHandle,
|
||||
["exception_type"] = failure.ExceptionType,
|
||||
["hresult"] = failure.HResult,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
private static WorkerFault CreateFault(WorkerFrameProtocolException exception)
|
||||
{
|
||||
return new WorkerFault
|
||||
@@ -295,6 +511,22 @@ public sealed class WorkerPipeSession
|
||||
return fault;
|
||||
}
|
||||
|
||||
private static WorkerFault CreateShutdownTimeoutFault(TimeoutException exception)
|
||||
{
|
||||
string message = exception.Message;
|
||||
return new WorkerFault
|
||||
{
|
||||
Category = WorkerFaultCategory.ShutdownTimeout,
|
||||
ExceptionType = exception.GetType().FullName ?? string.Empty,
|
||||
DiagnosticMessage = message,
|
||||
ProtocolStatus = new ProtocolStatus
|
||||
{
|
||||
Code = ProtocolStatusCode.WorkerUnavailable,
|
||||
Message = message,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
private static WorkerFaultCategory MapFaultCategory(WorkerFrameProtocolErrorCode errorCode)
|
||||
{
|
||||
return errorCode switch
|
||||
|
||||
Reference in New Issue
Block a user