Add Polly resilience policies
This commit is contained in:
@@ -80,6 +80,18 @@ public sealed class GatewayOptionsValidator : IValidateOptions<GatewayOptions>
|
||||
options.StartupTimeoutSeconds,
|
||||
"MxGateway:Worker:StartupTimeoutSeconds must be greater than zero.",
|
||||
failures);
|
||||
AddIfNotPositive(
|
||||
options.StartupProbeRetryAttempts,
|
||||
"MxGateway:Worker:StartupProbeRetryAttempts must be greater than zero.",
|
||||
failures);
|
||||
AddIfNotPositive(
|
||||
options.StartupProbeRetryDelayMilliseconds,
|
||||
"MxGateway:Worker:StartupProbeRetryDelayMilliseconds must be greater than zero.",
|
||||
failures);
|
||||
AddIfNotPositive(
|
||||
options.PipeConnectAttemptTimeoutMilliseconds,
|
||||
"MxGateway:Worker:PipeConnectAttemptTimeoutMilliseconds must be greater than zero.",
|
||||
failures);
|
||||
AddIfNotPositive(
|
||||
options.ShutdownTimeoutSeconds,
|
||||
"MxGateway:Worker:ShutdownTimeoutSeconds must be greater than zero.",
|
||||
|
||||
@@ -11,6 +11,12 @@ public sealed class WorkerOptions
|
||||
|
||||
public int StartupTimeoutSeconds { get; init; } = 30;
|
||||
|
||||
public int StartupProbeRetryAttempts { get; init; } = 3;
|
||||
|
||||
public int StartupProbeRetryDelayMilliseconds { get; init; } = 250;
|
||||
|
||||
public int PipeConnectAttemptTimeoutMilliseconds { get; init; } = 2000;
|
||||
|
||||
public int ShutdownTimeoutSeconds { get; init; } = 10;
|
||||
|
||||
public int HeartbeatIntervalSeconds { get; init; } = 5;
|
||||
|
||||
@@ -20,11 +20,13 @@ public sealed class GatewayMetrics : IDisposable
|
||||
private readonly Counter<long> _workerExitsCounter;
|
||||
private readonly Counter<long> _heartbeatFailuresCounter;
|
||||
private readonly Counter<long> _streamDisconnectsCounter;
|
||||
private readonly Counter<long> _retryAttemptsCounter;
|
||||
private readonly Histogram<double> _workerStartupLatencyHistogram;
|
||||
private readonly Histogram<double> _commandLatencyHistogram;
|
||||
private readonly Histogram<double> _eventStreamSendLatencyHistogram;
|
||||
private readonly Dictionary<string, long> _commandFailuresByMethod = new(StringComparer.OrdinalIgnoreCase);
|
||||
private readonly Dictionary<string, long> _eventsByFamily = new(StringComparer.OrdinalIgnoreCase);
|
||||
private readonly Dictionary<string, long> _retryAttemptsByArea = new(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
private int _openSessions;
|
||||
private int _workersRunning;
|
||||
@@ -41,6 +43,7 @@ public sealed class GatewayMetrics : IDisposable
|
||||
private long _workerExits;
|
||||
private long _heartbeatFailures;
|
||||
private long _streamDisconnects;
|
||||
private long _retryAttempts;
|
||||
private bool _disposed;
|
||||
|
||||
public GatewayMetrics()
|
||||
@@ -58,6 +61,7 @@ public sealed class GatewayMetrics : IDisposable
|
||||
_workerExitsCounter = _meter.CreateCounter<long>("mxgateway.workers.exited");
|
||||
_heartbeatFailuresCounter = _meter.CreateCounter<long>("mxgateway.heartbeats.failed");
|
||||
_streamDisconnectsCounter = _meter.CreateCounter<long>("mxgateway.grpc.streams.disconnected");
|
||||
_retryAttemptsCounter = _meter.CreateCounter<long>("mxgateway.retries.attempted");
|
||||
_workerStartupLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.workers.startup.duration", "ms");
|
||||
_commandLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.commands.duration", "ms");
|
||||
_eventStreamSendLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.events.stream_send.duration", "ms");
|
||||
@@ -238,6 +242,17 @@ public sealed class GatewayMetrics : IDisposable
|
||||
_streamDisconnectsCounter.Add(1, new KeyValuePair<string, object?>("reason", reason));
|
||||
}
|
||||
|
||||
/// <summary>
/// Records a single retry attempt for the given functional area.
/// </summary>
/// <param name="area">Label identifying where the retry happened; used as the "area" tag on the counter measurement.</param>
public void RetryAttempted(string area)
{
    KeyValuePair<string, object?> areaTag = new("area", area);

    // The running total and the per-area tally are updated together under the shared lock.
    lock (_syncRoot)
    {
        _retryAttempts += 1;
        Increment(_retryAttemptsByArea, area);
    }

    // The instrument itself is updated outside the lock.
    _retryAttemptsCounter.Add(1, areaTag);
}
|
||||
|
||||
public GatewayMetricsSnapshot GetSnapshot()
|
||||
{
|
||||
lock (_syncRoot)
|
||||
@@ -258,8 +273,10 @@ public sealed class GatewayMetrics : IDisposable
|
||||
WorkerExits: _workerExits,
|
||||
HeartbeatFailures: _heartbeatFailures,
|
||||
StreamDisconnects: _streamDisconnects,
|
||||
RetryAttempts: _retryAttempts,
|
||||
CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase),
|
||||
EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase));
|
||||
EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase),
|
||||
RetryAttemptsByArea: new Dictionary<string, long>(_retryAttemptsByArea, StringComparer.OrdinalIgnoreCase));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -16,5 +16,7 @@ public sealed record GatewayMetricsSnapshot(
|
||||
long WorkerExits,
|
||||
long HeartbeatFailures,
|
||||
long StreamDisconnects,
|
||||
long RetryAttempts,
|
||||
IReadOnlyDictionary<string, long> CommandFailuresByMethod,
|
||||
IReadOnlyDictionary<string, long> EventsByFamily);
|
||||
IReadOnlyDictionary<string, long> EventsByFamily,
|
||||
IReadOnlyDictionary<string, long> RetryAttemptsByArea);
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Grpc.AspNetCore" Version="2.76.0" />
|
||||
<PackageReference Include="Microsoft.Data.Sqlite" Version="10.0.7" />
|
||||
<PackageReference Include="Polly.Core" Version="8.6.6" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
||||
@@ -1,25 +1,33 @@
|
||||
using System.Diagnostics;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Options;
|
||||
using MxGateway.Server.Configuration;
|
||||
using MxGateway.Server.Metrics;
|
||||
using Polly;
|
||||
using Polly.Retry;
|
||||
|
||||
namespace MxGateway.Server.Workers;
|
||||
|
||||
public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
|
||||
{
|
||||
public const string WorkerNonceEnvironmentVariableName = "MXGATEWAY_WORKER_NONCE";
|
||||
public const string WorkerPipeConnectAttemptTimeoutEnvironmentVariableName =
|
||||
"MXGATEWAY_WORKER_PIPE_CONNECT_ATTEMPT_TIMEOUT_MS";
|
||||
|
||||
private readonly IWorkerProcessFactory _processFactory;
|
||||
private readonly IWorkerStartupProbe _startupProbe;
|
||||
private readonly GatewayMetrics _metrics;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly WorkerOptions _workerOptions;
|
||||
private readonly ILogger<WorkerProcessLauncher> _logger;
|
||||
|
||||
public WorkerProcessLauncher(
|
||||
IOptions<GatewayOptions> gatewayOptions,
|
||||
IWorkerProcessFactory processFactory,
|
||||
IWorkerStartupProbe startupProbe,
|
||||
GatewayMetrics metrics,
|
||||
ILogger<WorkerProcessLauncher>? logger = null,
|
||||
TimeProvider? timeProvider = null)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(gatewayOptions);
|
||||
@@ -32,6 +40,7 @@ public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
|
||||
_startupProbe = startupProbe;
|
||||
_metrics = metrics;
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
_logger = logger ?? NullLogger<WorkerProcessLauncher>.Instance;
|
||||
}
|
||||
|
||||
public async Task<WorkerProcessHandle> LaunchAsync(
|
||||
@@ -76,8 +85,15 @@ public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
|
||||
using CancellationTokenSource startupTimeout = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
|
||||
startupTimeout.CancelAfter(TimeSpan.FromSeconds(_workerOptions.StartupTimeoutSeconds));
|
||||
|
||||
await _startupProbe
|
||||
.WaitUntilReadyAsync(process, request, startupTimeout.Token)
|
||||
await CreateStartupProbePipeline(process)
|
||||
.ExecuteAsync(
|
||||
async token =>
|
||||
{
|
||||
await _startupProbe
|
||||
.WaitUntilReadyAsync(process, request, token)
|
||||
.ConfigureAwait(false);
|
||||
},
|
||||
startupTimeout.Token)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
_metrics.WorkerStarted(_timeProvider.GetUtcNow() - startedAt);
|
||||
@@ -143,6 +159,8 @@ public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
|
||||
}
|
||||
|
||||
startInfo.Environment[WorkerNonceEnvironmentVariableName] = request.Nonce;
|
||||
startInfo.Environment[WorkerPipeConnectAttemptTimeoutEnvironmentVariableName] =
|
||||
_workerOptions.PipeConnectAttemptTimeoutMilliseconds.ToString(System.Globalization.CultureInfo.InvariantCulture);
|
||||
|
||||
commandLine = new WorkerProcessCommandLine(executablePath, arguments);
|
||||
|
||||
@@ -229,6 +247,43 @@ public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds the resilience pipeline that wraps the worker startup probe.
/// </summary>
/// <param name="process">The worker process being probed; its exit state is consulted when deciding whether a failure is retryable.</param>
/// <returns>
/// A pipeline that retries retryable probe failures with jittered exponential backoff,
/// or an empty (pass-through) pipeline when the configuration allows only a single attempt.
/// </returns>
/// <remarks>
/// <c>StartupProbeRetryAttempts</c> is treated as the total number of probe attempts, so the
/// number of retries is one less than the configured value.
/// </remarks>
private ResiliencePipeline CreateStartupProbePipeline(IWorkerProcess process)
{
    int maxRetries = Math.Max(0, _workerOptions.StartupProbeRetryAttempts - 1);

    // Polly validates RetryStrategyOptions.MaxRetryAttempts as >= 1, so a configuration of a
    // single attempt (zero retries) must not be passed to AddRetry; run the probe once instead.
    if (maxRetries == 0)
    {
        return ResiliencePipeline.Empty;
    }

    RetryStrategyOptions retryOptions = new()
    {
        MaxRetryAttempts = maxRetries,
        BackoffType = DelayBackoffType.Exponential,
        UseJitter = true,
        Delay = TimeSpan.FromMilliseconds(_workerOptions.StartupProbeRetryDelayMilliseconds),
        MaxDelay = TimeSpan.FromSeconds(2),
        ShouldHandle = new PredicateBuilder().Handle<Exception>(exception =>
            ShouldRetryStartupProbe(exception, process)),
        OnRetry = args =>
        {
            // Surface every retry both as a metric and as a debug log entry.
            _metrics.RetryAttempted("worker_startup");
            _logger.LogDebug(
                args.Outcome.Exception,
                "Retrying worker startup probe after transient failure. Attempt {Attempt}.",
                args.AttemptNumber + 1);
            return default;
        },
    };

    return new ResiliencePipelineBuilder()
        .AddRetry(retryOptions)
        .Build();
}
|
||||
|
||||
/// <summary>
/// Decides whether a failed startup probe attempt should be retried.
/// </summary>
/// <param name="exception">The exception thrown by the probe attempt.</param>
/// <param name="process">The worker process that was being probed.</param>
/// <returns>
/// <see langword="false"/> for <see cref="OperationCanceledException"/> and
/// <c>WorkerProcessLaunchException</c>; otherwise <see langword="true"/> only while the
/// process has not exited.
/// </returns>
private static bool ShouldRetryStartupProbe(Exception exception, IWorkerProcess process) =>
    exception switch
    {
        OperationCanceledException => false,
        WorkerProcessLaunchException => false,
        _ => !process.HasExited,
    };
|
||||
|
||||
private static void ValidateRequest(WorkerProcessLaunchRequest request)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(request.SessionId))
|
||||
|
||||
Reference in New Issue
Block a user