Implement worker heartbeat watchdog
This commit is contained in:
@@ -6,6 +6,7 @@ using System.Threading.Tasks;
|
||||
using Google.Protobuf.WellKnownTypes;
|
||||
using MxGateway.Contracts.Proto;
|
||||
using MxGateway.Worker.MxAccess;
|
||||
using MxGateway.Worker.Sta;
|
||||
|
||||
namespace MxGateway.Worker.Ipc;
|
||||
|
||||
@@ -13,10 +14,14 @@ public sealed class WorkerPipeSession
|
||||
{
|
||||
private readonly WorkerFrameProtocolOptions _options;
|
||||
private readonly Func<int> _processIdProvider;
|
||||
private readonly Func<IWorkerRuntimeSession> _runtimeSessionFactory;
|
||||
private readonly WorkerPipeSessionOptions _sessionOptions;
|
||||
private readonly WorkerFrameReader _reader;
|
||||
private readonly WorkerFrameWriter _writer;
|
||||
private MxAccessStaSession? _mxAccessStaSession;
|
||||
private IWorkerRuntimeSession? _runtimeSession;
|
||||
private long _nextSequence;
|
||||
private WorkerState _state = WorkerState.Starting;
|
||||
private bool _watchdogFaultSent;
|
||||
|
||||
public WorkerPipeSession(
|
||||
Stream stream,
|
||||
@@ -34,11 +39,49 @@ public sealed class WorkerPipeSession
|
||||
WorkerFrameWriter writer,
|
||||
WorkerFrameProtocolOptions options,
|
||||
Func<int> processIdProvider)
|
||||
: this(
|
||||
reader,
|
||||
writer,
|
||||
options,
|
||||
processIdProvider,
|
||||
new WorkerPipeSessionOptions(),
|
||||
() => new MxAccessStaSession())
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
/// Creates a pipe session from explicit collaborators, including the session options and the
/// factory that produces the runtime session.
/// </summary>
/// <param name="reader">Frame reader used to receive gateway envelopes.</param>
/// <param name="writer">Frame writer used to send worker envelopes.</param>
/// <param name="options">Frame protocol options.</param>
/// <param name="processIdProvider">Callback that supplies the worker process id.</param>
/// <param name="sessionOptions">Session options; validated via <c>Validate()</c> before the constructor returns.</param>
/// <param name="runtimeSessionFactory">Factory invoked to create the runtime session.</param>
/// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
public WorkerPipeSession(
    WorkerFrameReader reader,
    WorkerFrameWriter writer,
    WorkerFrameProtocolOptions options,
    Func<int> processIdProvider,
    WorkerPipeSessionOptions sessionOptions,
    Func<IWorkerRuntimeSession> runtimeSessionFactory)
{
    // Null checks run in parameter order so the first missing argument is the one reported.
    if (reader is null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    if (writer is null)
    {
        throw new ArgumentNullException(nameof(writer));
    }

    if (options is null)
    {
        throw new ArgumentNullException(nameof(options));
    }

    if (processIdProvider is null)
    {
        throw new ArgumentNullException(nameof(processIdProvider));
    }

    if (sessionOptions is null)
    {
        throw new ArgumentNullException(nameof(sessionOptions));
    }

    if (runtimeSessionFactory is null)
    {
        throw new ArgumentNullException(nameof(runtimeSessionFactory));
    }

    _reader = reader;
    _writer = writer;
    _options = options;
    _processIdProvider = processIdProvider;
    _sessionOptions = sessionOptions;
    _runtimeSessionFactory = runtimeSessionFactory;

    // Validate() is defined on WorkerPipeSessionOptions; what it checks is not visible here.
    _sessionOptions.Validate();
}
|
||||
|
||||
/// <summary>
/// Runs the session end to end: creates the runtime session from the factory, completes the
/// startup handshake, then runs the message loop until it returns.
/// </summary>
/// <param name="cancellationToken">Token passed to the handshake and the message loop.</param>
/// <exception cref="InvalidOperationException">The runtime session factory returned <c>null</c>.</exception>
public async Task RunAsync(CancellationToken cancellationToken = default)
{
    // Keep the session in a local: the handshake callback runs later, and reading the mutable
    // nullable field at that point could dereference null. A null factory result fails here
    // with a clear message instead of a NullReferenceException in the middle of the handshake.
    IWorkerRuntimeSession runtimeSession = _runtimeSessionFactory()
        ?? throw new InvalidOperationException("Worker runtime session factory returned null.");
    _runtimeSession = runtimeSession;

    try
    {
        await CompleteStartupHandshakeAsync(
            token => runtimeSession.StartAsync(_processIdProvider(), token),
            cancellationToken).ConfigureAwait(false);
        await RunMessageLoopAsync(cancellationToken).ConfigureAwait(false);
    }
    finally
    {
        // Always release the runtime session and mark the worker stopped, whether the
        // session ended normally or with an exception.
        _runtimeSession?.Dispose();
        _runtimeSession = null;
        _state = WorkerState.Stopped;
    }
}
|
||||
|
||||
public Task CompleteStartupHandshakeAsync(CancellationToken cancellationToken = default)
|
||||
@@ -76,11 +119,14 @@ public sealed class WorkerPipeSession
|
||||
try
|
||||
{
|
||||
WorkerEnvelope envelope = await _reader.ReadAsync(cancellationToken).ConfigureAwait(false);
|
||||
_state = WorkerState.Handshaking;
|
||||
ValidateGatewayHello(envelope);
|
||||
|
||||
await WriteWorkerHelloAsync(cancellationToken).ConfigureAwait(false);
|
||||
_state = WorkerState.InitializingSta;
|
||||
WorkerReady ready = await initializeMxAccessAsync(cancellationToken).ConfigureAwait(false);
|
||||
await WriteWorkerReadyAsync(ready, cancellationToken).ConfigureAwait(false);
|
||||
_state = WorkerState.Ready;
|
||||
}
|
||||
catch (WorkerFrameProtocolException exception)
|
||||
{
|
||||
@@ -140,6 +186,174 @@ public sealed class WorkerPipeSession
|
||||
return _writer.WriteAsync(CreateEnvelope(ready), cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
/// Reads gateway envelopes and dispatches them while the heartbeat loop runs alongside.
/// Returns when dispatch reports that reading should stop, or when cancellation is observed
/// at the top of the loop.
/// </summary>
/// <param name="cancellationToken">Token for reads and dispatch; the heartbeat loop uses a token linked to it.</param>
private async Task RunMessageLoopAsync(CancellationToken cancellationToken)
{
    using (CancellationTokenSource heartbeatCts =
        CancellationTokenSource.CreateLinkedTokenSource(cancellationToken))
    {
        Task heartbeat = RunHeartbeatLoopAsync(heartbeatCts.Token);

        try
        {
            bool keepReading = true;
            while (keepReading && !cancellationToken.IsCancellationRequested)
            {
                Task<WorkerEnvelope> pendingRead = _reader.ReadAsync(cancellationToken);
                Task firstFinished = await Task.WhenAny(pendingRead, heartbeat).ConfigureAwait(false);

                if (ReferenceEquals(firstFinished, heartbeat))
                {
                    // Awaiting the finished heartbeat task rethrows its failure, if it has one.
                    await heartbeat.ConfigureAwait(false);
                }

                WorkerEnvelope incoming = await pendingRead.ConfigureAwait(false);
                keepReading = await DispatchGatewayEnvelopeAsync(incoming, cancellationToken)
                    .ConfigureAwait(false);
            }
        }
        finally
        {
            // Stop the heartbeat loop and wait for it to finish before leaving.
            heartbeatCts.Cancel();
            try
            {
                await heartbeat.ConfigureAwait(false);
            }
            catch (OperationCanceledException)
            {
                // Expected: the heartbeat loop was cancelled just above.
            }
        }
    }
}
|
||||
|
||||
/// <summary>
/// Routes one gateway envelope according to its body case.
/// </summary>
/// <param name="envelope">Envelope received from the gateway.</param>
/// <param name="cancellationToken">Token passed on to command processing and shutdown.</param>
/// <returns><c>true</c> to keep reading; <c>false</c> after a shutdown request has been handled.</returns>
/// <exception cref="WorkerFrameProtocolException">The envelope body is not a command, shutdown, or cancel.</exception>
private async Task<bool> DispatchGatewayEnvelopeAsync(
    WorkerEnvelope envelope,
    CancellationToken cancellationToken)
{
    WorkerEnvelope.BodyOneofCase body = envelope.BodyCase;

    if (body == WorkerEnvelope.BodyOneofCase.WorkerCommand)
    {
        // Started without awaiting, so the caller can go on reading while the command runs.
        _ = ProcessCommandAsync(envelope, cancellationToken);
        return true;
    }

    if (body == WorkerEnvelope.BodyOneofCase.WorkerShutdown)
    {
        await ShutdownAsync(envelope.WorkerShutdown, cancellationToken).ConfigureAwait(false);
        return false;
    }

    if (body == WorkerEnvelope.BodyOneofCase.WorkerCancel)
    {
        // Accepted; no action is taken for cancel envelopes here.
        return true;
    }

    throw new WorkerFrameProtocolException(
        WorkerFrameProtocolErrorCode.UnexpectedEnvelopeBody,
        $"Worker received unexpected gateway envelope body {envelope.BodyCase}.");
}
|
||||
|
||||
/// <summary>
/// Dispatches the command carried by <paramref name="envelope"/> to the runtime session and
/// writes the reply. Any failure other than cancellation marks the worker faulted and is
/// reported as an <c>MxaccessCommandFailed</c> fault.
/// </summary>
/// <param name="envelope">Envelope whose body is a worker command.</param>
/// <param name="cancellationToken">Token attached to the STA command and used for writes.</param>
/// <exception cref="InvalidOperationException">No runtime session is currently set.</exception>
private async Task ProcessCommandAsync(
    WorkerEnvelope envelope,
    CancellationToken cancellationToken)
{
    IWorkerRuntimeSession? session = _runtimeSession;
    if (session is null)
    {
        throw new InvalidOperationException("Worker runtime session has not been initialized.");
    }

    WorkerCommand request = envelope.WorkerCommand;
    MxCommand payload = request.Command;
    var staCommand = new StaCommand(
        _options.SessionId,
        envelope.CorrelationId,
        payload,
        request.EnqueueTimestamp,
        cancellationToken);

    try
    {
        MxCommandReply mxReply = await session.DispatchAsync(staCommand).ConfigureAwait(false);

        // The completion timestamp is taken after dispatch has returned.
        var commandReply = new WorkerCommandReply
        {
            Reply = mxReply,
            CompletedTimestamp = Timestamp.FromDateTime(DateTime.UtcNow),
        };

        WorkerEnvelope replyEnvelope = CreateEnvelope(commandReply);
        await _writer.WriteAsync(replyEnvelope, cancellationToken).ConfigureAwait(false);
    }
    catch (Exception failure) when (!(failure is OperationCanceledException))
    {
        // Cancellation propagates untouched; everything else is reported as a fault.
        _state = WorkerState.Faulted;

        WorkerFault fault = CreateFault(
            WorkerFaultCategory.MxaccessCommandFailed,
            staCommand.MethodName,
            failure);
        await TryWriteFaultAsync(fault, cancellationToken).ConfigureAwait(false);
    }
}
|
||||
|
||||
/// <summary>
/// Handles a gateway shutdown request: moves to <c>ShuttingDown</c>, asks the runtime session
/// (if one is set) to shut down, and writes an OK acknowledgement.
/// </summary>
/// <param name="shutdown">Shutdown request; its reason, when not blank, is included in the acknowledgement message.</param>
/// <param name="cancellationToken">Token for the acknowledgement write.</param>
private async Task ShutdownAsync(
    WorkerShutdown shutdown,
    CancellationToken cancellationToken)
{
    _state = WorkerState.ShuttingDown;
    _runtimeSession?.RequestShutdown();

    string ackMessage;
    if (string.IsNullOrWhiteSpace(shutdown.Reason))
    {
        ackMessage = "Worker shutdown accepted.";
    }
    else
    {
        ackMessage = $"Worker shutdown accepted: {shutdown.Reason}";
    }

    var status = new ProtocolStatus
    {
        Code = ProtocolStatusCode.Ok,
        Message = ackMessage,
    };
    var acknowledgement = new WorkerShutdownAck { Status = status };

    await _writer
        .WriteAsync(CreateEnvelope(acknowledgement), cancellationToken)
        .ConfigureAwait(false);
}
|
||||
|
||||
/// <summary>
/// Periodically captures a heartbeat snapshot from the runtime session, writes it to the
/// gateway, and runs the watchdog check on the same snapshot. Loops until cancellation.
/// </summary>
/// <param name="cancellationToken">Token that ends the loop; also used for the delay and the writes.</param>
private async Task RunHeartbeatLoopAsync(CancellationToken cancellationToken)
{
    while (!cancellationToken.IsCancellationRequested)
    {
        await Task.Delay(_sessionOptions.HeartbeatInterval, cancellationToken).ConfigureAwait(false);

        // Skip this tick entirely when no runtime session is currently set.
        if (_runtimeSession is IWorkerRuntimeSession session)
        {
            WorkerRuntimeHeartbeatSnapshot snapshot = session.CaptureHeartbeat();
            WorkerEnvelope heartbeatEnvelope = CreateEnvelope(CreateHeartbeat(snapshot));

            await _writer.WriteAsync(heartbeatEnvelope, cancellationToken).ConfigureAwait(false);
            await ReportWatchdogFaultIfNeededAsync(snapshot, cancellationToken).ConfigureAwait(false);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Sends a single <c>StaHung</c> fault when the snapshot's last STA activity is older than the
/// configured heartbeat grace. The "already sent" flag is cleared as soon as activity is
/// within the grace again, so a later stall is reported again.
/// </summary>
/// <param name="snapshot">Heartbeat snapshot to evaluate.</param>
/// <param name="cancellationToken">Token for the fault write.</param>
private async Task ReportWatchdogFaultIfNeededAsync(
    WorkerRuntimeHeartbeatSnapshot snapshot,
    CancellationToken cancellationToken)
{
    TimeSpan staleFor = DateTimeOffset.UtcNow - snapshot.LastStaActivityUtc;
    bool isStale = staleFor > _sessionOptions.HeartbeatGrace;

    if (!isStale)
    {
        _watchdogFaultSent = false;
        return;
    }

    if (!_watchdogFaultSent)
    {
        // Set the flag before writing so the fault is attempted only once per stall.
        _watchdogFaultSent = true;

        WorkerFault fault = CreateFault(
            WorkerFaultCategory.StaHung,
            snapshot.CurrentCommandCorrelationId,
            $"STA activity is stale by {staleFor}.");
        await TryWriteFaultAsync(fault, cancellationToken).ConfigureAwait(false);
    }
}
|
||||
|
||||
private async Task TryWriteFaultAsync(
|
||||
WorkerFrameProtocolException exception,
|
||||
CancellationToken cancellationToken)
|
||||
@@ -178,6 +392,25 @@ public sealed class WorkerPipeSession
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Best-effort write of a fault envelope. <see cref="IOException"/>,
/// <see cref="ObjectDisposedException"/> and <c>WorkerFrameProtocolException</c> raised by the
/// write are swallowed; any other exception propagates.
/// </summary>
/// <param name="fault">Fault to send.</param>
/// <param name="cancellationToken">Token for the write.</param>
private async Task TryWriteFaultAsync(
    WorkerFault fault,
    CancellationToken cancellationToken)
{
    WorkerEnvelope faultEnvelope = CreateEnvelope(fault);

    try
    {
        await _writer.WriteAsync(faultEnvelope, cancellationToken).ConfigureAwait(false);
    }
    catch (Exception writeFailure) when (
        writeFailure is IOException or ObjectDisposedException or WorkerFrameProtocolException)
    {
        // The runtime fault remains observable through worker exit or pipe closure.
    }
}
|
||||
|
||||
private WorkerEnvelope CreateEnvelope(WorkerHello hello)
|
||||
{
|
||||
return CreateBaseEnvelope(hello);
|
||||
@@ -193,6 +426,21 @@ public sealed class WorkerPipeSession
|
||||
return CreateBaseEnvelope(fault);
|
||||
}
|
||||
|
||||
/// <summary>Wraps a command reply in a worker envelope.</summary>
private WorkerEnvelope CreateEnvelope(WorkerCommandReply reply) => CreateBaseEnvelope(reply);
|
||||
|
||||
/// <summary>Wraps a shutdown acknowledgement in a worker envelope.</summary>
private WorkerEnvelope CreateEnvelope(WorkerShutdownAck shutdownAck) => CreateBaseEnvelope(shutdownAck);
|
||||
|
||||
/// <summary>Wraps a heartbeat in a worker envelope.</summary>
private WorkerEnvelope CreateEnvelope(WorkerHeartbeat heartbeat) => CreateBaseEnvelope(heartbeat);
|
||||
|
||||
private WorkerEnvelope CreateBaseEnvelope(WorkerHello body)
|
||||
{
|
||||
WorkerEnvelope envelope = CreateBaseEnvelope();
|
||||
@@ -214,6 +462,28 @@ public sealed class WorkerPipeSession
|
||||
return envelope;
|
||||
}
|
||||
|
||||
/// <summary>
/// Creates a base envelope carrying <paramref name="body"/> as its command reply. The
/// envelope's correlation id is copied from the inner reply, or left empty when there is none.
/// </summary>
private WorkerEnvelope CreateBaseEnvelope(WorkerCommandReply body)
{
    WorkerEnvelope result = CreateBaseEnvelope();

    string? replyCorrelationId = body.Reply?.CorrelationId;
    result.CorrelationId = replyCorrelationId ?? string.Empty;
    result.WorkerCommandReply = body;

    return result;
}
|
||||
|
||||
/// <summary>Creates a base envelope carrying <paramref name="body"/> as its shutdown acknowledgement.</summary>
private WorkerEnvelope CreateBaseEnvelope(WorkerShutdownAck body)
{
    WorkerEnvelope result = CreateBaseEnvelope();
    result.WorkerShutdownAck = body;
    return result;
}
|
||||
|
||||
/// <summary>Creates a base envelope carrying <paramref name="body"/> as its heartbeat.</summary>
private WorkerEnvelope CreateBaseEnvelope(WorkerHeartbeat body)
{
    WorkerEnvelope result = CreateBaseEnvelope();
    result.WorkerHeartbeat = body;
    return result;
}
|
||||
|
||||
private WorkerEnvelope CreateBaseEnvelope()
|
||||
{
|
||||
return new WorkerEnvelope
|
||||
@@ -231,21 +501,39 @@ public sealed class WorkerPipeSession
|
||||
|
||||
/// <summary>
/// Creates an <c>MxAccessStaSession</c>, stores it as the runtime session, and starts it.
/// </summary>
/// <param name="cancellationToken">Token passed to the session start.</param>
/// <returns>The <c>WorkerReady</c> produced by starting the session.</returns>
/// <remarks>
/// If the start throws, the session is disposed, the runtime-session field is cleared, and the
/// exception is rethrown.
/// </remarks>
private async Task<WorkerReady> InitializeMxAccessAsync(CancellationToken cancellationToken)
{
    // Only the runtime-session field is used. The original text interleaved the superseded
    // _mxAccessStaSession statements with their _runtimeSession replacements, which duplicated
    // the assignment, the return, and the cleanup.
    IWorkerRuntimeSession session = new MxAccessStaSession();
    _runtimeSession = session;

    try
    {
        return await session
            .StartAsync(_processIdProvider(), cancellationToken)
            .ConfigureAwait(false);
    }
    catch
    {
        // Dispose via the local so cleanup does not depend on the field still holding it.
        session.Dispose();
        _runtimeSession = null;
        throw;
    }
}
|
||||
|
||||
/// <summary>
/// Builds the heartbeat message for a runtime snapshot.
/// </summary>
/// <param name="snapshot">Snapshot captured from the runtime session.</param>
/// <returns>A heartbeat populated from the snapshot, the process id provider, and the session state.</returns>
/// <remarks>
/// The reported state is <c>ExecutingCommand</c> whenever the snapshot carries a non-blank
/// current command correlation id; otherwise it is the session's current state.
/// </remarks>
private WorkerHeartbeat CreateHeartbeat(WorkerRuntimeHeartbeatSnapshot snapshot)
{
    WorkerState state = string.IsNullOrWhiteSpace(snapshot.CurrentCommandCorrelationId)
        ? _state
        : WorkerState.ExecutingCommand;

    return new WorkerHeartbeat
    {
        WorkerProcessId = _processIdProvider(),
        State = state,
        LastStaActivityTimestamp = Timestamp.FromDateTimeOffset(snapshot.LastStaActivityUtc),
        PendingCommandCount = snapshot.PendingCommandCount,
        OutboundEventQueueDepth = snapshot.OutboundEventQueueDepth,
        LastEventSequence = snapshot.LastEventSequence,
        // The id is treated as possibly null/blank above, so normalize null to empty here,
        // matching how CreateFault handles its command-method argument. Protobuf-generated
        // string setters are expected to reject null -- confirm against the generated type.
        CurrentCommandCorrelationId = snapshot.CurrentCommandCorrelationId ?? string.Empty,
    };
}
|
||||
|
||||
private WorkerReady CreateWorkerReady()
|
||||
{
|
||||
return new WorkerReady
|
||||
@@ -295,6 +583,42 @@ public sealed class WorkerPipeSession
|
||||
return fault;
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds a fault from an exception: the exception message becomes the diagnostic and status
/// message, and the exception's full type name is recorded (empty when it has none).
/// </summary>
/// <param name="category">Fault category to report.</param>
/// <param name="commandMethod">Value stored as the fault's command method.</param>
/// <param name="exception">Exception that caused the fault.</param>
/// <returns>The populated fault with a <c>WorkerUnavailable</c> protocol status.</returns>
private static WorkerFault CreateFault(
    WorkerFaultCategory category,
    string commandMethod,
    Exception exception)
{
    WorkerFault result = CreateFault(category, commandMethod, exception.Message);

    string? exceptionTypeName = exception.GetType().FullName;
    result.ExceptionType = exceptionTypeName ?? string.Empty;

    var status = new ProtocolStatus
    {
        Code = ProtocolStatusCode.WorkerUnavailable,
        Message = exception.Message,
    };
    result.ProtocolStatus = status;

    return result;
}
|
||||
|
||||
/// <summary>
/// Builds a fault with the given category and diagnostic message and a
/// <c>WorkerUnavailable</c> protocol status carrying the same message.
/// </summary>
/// <param name="category">Fault category to report.</param>
/// <param name="commandMethod">Value stored as the fault's command method; <c>null</c> is stored as empty.</param>
/// <param name="diagnosticMessage">Message used for both the diagnostic text and the status message.</param>
/// <returns>The populated fault.</returns>
private static WorkerFault CreateFault(
    WorkerFaultCategory category,
    string commandMethod,
    string diagnosticMessage)
{
    var result = new WorkerFault();
    result.Category = category;
    result.CommandMethod = commandMethod ?? string.Empty;
    result.DiagnosticMessage = diagnosticMessage;

    var status = new ProtocolStatus();
    status.Code = ProtocolStatusCode.WorkerUnavailable;
    status.Message = diagnosticMessage;
    result.ProtocolStatus = status;

    return result;
}
|
||||
|
||||
private static WorkerFaultCategory MapFaultCategory(WorkerFrameProtocolErrorCode errorCode)
|
||||
{
|
||||
return errorCode switch
|
||||
|
||||
Reference in New Issue
Block a user