Add Polly resilience policies

This commit is contained in:
Joseph Doherty
2026-04-27 15:37:56 -04:00
parent d431ff9660
commit bd4a09a35e
22 changed files with 611 additions and 21 deletions
@@ -80,6 +80,18 @@ public sealed class GatewayOptionsValidator : IValidateOptions<GatewayOptions>
options.StartupTimeoutSeconds,
"MxGateway:Worker:StartupTimeoutSeconds must be greater than zero.",
failures);
AddIfNotPositive(
options.StartupProbeRetryAttempts,
"MxGateway:Worker:StartupProbeRetryAttempts must be greater than zero.",
failures);
AddIfNotPositive(
options.StartupProbeRetryDelayMilliseconds,
"MxGateway:Worker:StartupProbeRetryDelayMilliseconds must be greater than zero.",
failures);
AddIfNotPositive(
options.PipeConnectAttemptTimeoutMilliseconds,
"MxGateway:Worker:PipeConnectAttemptTimeoutMilliseconds must be greater than zero.",
failures);
AddIfNotPositive(
options.ShutdownTimeoutSeconds,
"MxGateway:Worker:ShutdownTimeoutSeconds must be greater than zero.",
@@ -11,6 +11,12 @@ public sealed class WorkerOptions
/// <summary>
/// Overall time budget, in seconds, applied to the worker startup probe (retries included)
/// by the launcher. Must be greater than zero. Defaults to 30.
/// </summary>
public int StartupTimeoutSeconds { get; init; } = 30;
/// <summary>
/// Total number of startup probe attempts, counting the first one; the launcher configures
/// one fewer retry than this value. Must be greater than zero. Defaults to 3.
/// </summary>
public int StartupProbeRetryAttempts { get; init; } = 3;
/// <summary>
/// Base delay, in milliseconds, for the exponential backoff between startup probe retries.
/// Must be greater than zero. Defaults to 250.
/// </summary>
public int StartupProbeRetryDelayMilliseconds { get; init; } = 250;
/// <summary>
/// Value, in milliseconds, handed to the worker process through the
/// MXGATEWAY_WORKER_PIPE_CONNECT_ATTEMPT_TIMEOUT_MS environment variable.
/// Must be greater than zero. Defaults to 2000.
/// NOTE(review): presumably the worker uses this as its per-attempt pipe connect timeout — confirm in the worker.
/// </summary>
public int PipeConnectAttemptTimeoutMilliseconds { get; init; } = 2000;
/// <summary>Shutdown timeout in seconds. Must be greater than zero. Defaults to 10.</summary>
public int ShutdownTimeoutSeconds { get; init; } = 10;
/// <summary>Heartbeat interval in seconds. Defaults to 5.</summary>
public int HeartbeatIntervalSeconds { get; init; } = 5;
+18 -1
View File
@@ -20,11 +20,13 @@ public sealed class GatewayMetrics : IDisposable
private readonly Counter<long> _workerExitsCounter;
private readonly Counter<long> _heartbeatFailuresCounter;
private readonly Counter<long> _streamDisconnectsCounter;
private readonly Counter<long> _retryAttemptsCounter;
private readonly Histogram<double> _workerStartupLatencyHistogram;
private readonly Histogram<double> _commandLatencyHistogram;
private readonly Histogram<double> _eventStreamSendLatencyHistogram;
private readonly Dictionary<string, long> _commandFailuresByMethod = new(StringComparer.OrdinalIgnoreCase);
private readonly Dictionary<string, long> _eventsByFamily = new(StringComparer.OrdinalIgnoreCase);
private readonly Dictionary<string, long> _retryAttemptsByArea = new(StringComparer.OrdinalIgnoreCase);
private int _openSessions;
private int _workersRunning;
@@ -41,6 +43,7 @@ public sealed class GatewayMetrics : IDisposable
private long _workerExits;
private long _heartbeatFailures;
private long _streamDisconnects;
private long _retryAttempts;
private bool _disposed;
public GatewayMetrics()
@@ -58,6 +61,7 @@ public sealed class GatewayMetrics : IDisposable
_workerExitsCounter = _meter.CreateCounter<long>("mxgateway.workers.exited");
_heartbeatFailuresCounter = _meter.CreateCounter<long>("mxgateway.heartbeats.failed");
_streamDisconnectsCounter = _meter.CreateCounter<long>("mxgateway.grpc.streams.disconnected");
_retryAttemptsCounter = _meter.CreateCounter<long>("mxgateway.retries.attempted");
_workerStartupLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.workers.startup.duration", "ms");
_commandLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.commands.duration", "ms");
_eventStreamSendLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.events.stream_send.duration", "ms");
@@ -238,6 +242,17 @@ public sealed class GatewayMetrics : IDisposable
_streamDisconnectsCounter.Add(1, new KeyValuePair<string, object?>("reason", reason));
}
/// <summary>
/// Records a single retry attempt for the given area.
/// </summary>
/// <remarks>
/// The running total and the per-area tally are updated together under the metrics lock;
/// the <c>mxgateway.retries.attempted</c> counter is emitted afterwards, outside the lock,
/// tagged with the area.
/// </remarks>
/// <param name="area">Label identifying where the retry happened; used as the <c>area</c> tag and tally key.</param>
public void RetryAttempted(string area)
{
    KeyValuePair<string, object?> areaTag = new("area", area);

    lock (_syncRoot)
    {
        _retryAttempts += 1;
        Increment(_retryAttemptsByArea, area);
    }

    _retryAttemptsCounter.Add(1, areaTag);
}
public GatewayMetricsSnapshot GetSnapshot()
{
lock (_syncRoot)
@@ -258,8 +273,10 @@ public sealed class GatewayMetrics : IDisposable
WorkerExits: _workerExits,
HeartbeatFailures: _heartbeatFailures,
StreamDisconnects: _streamDisconnects,
RetryAttempts: _retryAttempts,
CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase),
EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase));
EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase),
RetryAttemptsByArea: new Dictionary<string, long>(_retryAttemptsByArea, StringComparer.OrdinalIgnoreCase));
}
}
@@ -16,5 +16,7 @@ public sealed record GatewayMetricsSnapshot(
long WorkerExits,
long HeartbeatFailures,
long StreamDisconnects,
long RetryAttempts,
IReadOnlyDictionary<string, long> CommandFailuresByMethod,
IReadOnlyDictionary<string, long> EventsByFamily);
IReadOnlyDictionary<string, long> EventsByFamily,
IReadOnlyDictionary<string, long> RetryAttemptsByArea);
@@ -7,6 +7,7 @@
<ItemGroup>
<PackageReference Include="Grpc.AspNetCore" Version="2.76.0" />
<PackageReference Include="Microsoft.Data.Sqlite" Version="10.0.7" />
<PackageReference Include="Polly.Core" Version="8.6.6" />
</ItemGroup>
<ItemGroup>
@@ -1,25 +1,33 @@
using System.Diagnostics;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using MxGateway.Server.Configuration;
using MxGateway.Server.Metrics;
using Polly;
using Polly.Retry;
namespace MxGateway.Server.Workers;
public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
{
public const string WorkerNonceEnvironmentVariableName = "MXGATEWAY_WORKER_NONCE";
public const string WorkerPipeConnectAttemptTimeoutEnvironmentVariableName =
"MXGATEWAY_WORKER_PIPE_CONNECT_ATTEMPT_TIMEOUT_MS";
private readonly IWorkerProcessFactory _processFactory;
private readonly IWorkerStartupProbe _startupProbe;
private readonly GatewayMetrics _metrics;
private readonly TimeProvider _timeProvider;
private readonly WorkerOptions _workerOptions;
private readonly ILogger<WorkerProcessLauncher> _logger;
public WorkerProcessLauncher(
IOptions<GatewayOptions> gatewayOptions,
IWorkerProcessFactory processFactory,
IWorkerStartupProbe startupProbe,
GatewayMetrics metrics,
ILogger<WorkerProcessLauncher>? logger = null,
TimeProvider? timeProvider = null)
{
ArgumentNullException.ThrowIfNull(gatewayOptions);
@@ -32,6 +40,7 @@ public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
_startupProbe = startupProbe;
_metrics = metrics;
_timeProvider = timeProvider ?? TimeProvider.System;
_logger = logger ?? NullLogger<WorkerProcessLauncher>.Instance;
}
public async Task<WorkerProcessHandle> LaunchAsync(
@@ -76,8 +85,15 @@ public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
using CancellationTokenSource startupTimeout = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
startupTimeout.CancelAfter(TimeSpan.FromSeconds(_workerOptions.StartupTimeoutSeconds));
await _startupProbe
.WaitUntilReadyAsync(process, request, startupTimeout.Token)
await CreateStartupProbePipeline(process)
.ExecuteAsync(
async token =>
{
await _startupProbe
.WaitUntilReadyAsync(process, request, token)
.ConfigureAwait(false);
},
startupTimeout.Token)
.ConfigureAwait(false);
_metrics.WorkerStarted(_timeProvider.GetUtcNow() - startedAt);
@@ -143,6 +159,8 @@ public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
}
startInfo.Environment[WorkerNonceEnvironmentVariableName] = request.Nonce;
startInfo.Environment[WorkerPipeConnectAttemptTimeoutEnvironmentVariableName] =
_workerOptions.PipeConnectAttemptTimeoutMilliseconds.ToString(System.Globalization.CultureInfo.InvariantCulture);
commandLine = new WorkerProcessCommandLine(executablePath, arguments);
@@ -229,6 +247,43 @@ public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
}
}
/// <summary>
/// Builds the retry pipeline that wraps the startup probe for one worker launch.
/// </summary>
/// <param name="process">Worker process under probe; retries stop once it has exited.</param>
/// <returns>
/// A pipeline with a single exponential, jittered retry strategy, or
/// <see cref="ResiliencePipeline.Empty"/> when the configuration allows only one attempt.
/// </returns>
private ResiliencePipeline CreateStartupProbePipeline(IWorkerProcess process)
{
    // StartupProbeRetryAttempts counts every attempt, the first included; Polly counts only retries.
    int maxRetryAttempts = _workerOptions.StartupProbeRetryAttempts - 1;
    if (maxRetryAttempts < 1)
    {
        // Polly validates MaxRetryAttempts as at least one, so a single-attempt configuration
        // (which the options validator accepts) is expressed as a pass-through pipeline
        // instead of failing when the retry strategy is built.
        return ResiliencePipeline.Empty;
    }

    TimeSpan baseDelay = TimeSpan.FromMilliseconds(_workerOptions.StartupProbeRetryDelayMilliseconds);
    TimeSpan backoffCap = TimeSpan.FromSeconds(2);

    RetryStrategyOptions retryOptions = new()
    {
        MaxRetryAttempts = maxRetryAttempts,
        BackoffType = DelayBackoffType.Exponential,
        UseJitter = true,
        Delay = baseDelay,
        // The cap bounds exponential growth; it must not shrink a configured base delay
        // that is already larger than the default two-second ceiling.
        MaxDelay = baseDelay > backoffCap ? baseDelay : backoffCap,
        ShouldHandle = new PredicateBuilder().Handle<Exception>(exception =>
            ShouldRetryStartupProbe(exception, process)),
        OnRetry = args =>
        {
            _metrics.RetryAttempted("worker_startup");
            _logger.LogDebug(
                args.Outcome.Exception,
                "Retrying worker startup probe after transient failure. Attempt {Attempt}.",
                args.AttemptNumber + 1);
            return default;
        },
    };

    return new ResiliencePipelineBuilder()
        .AddRetry(retryOptions)
        .Build();
}
/// <summary>
/// Decides whether a failed startup probe attempt is worth retrying.
/// </summary>
/// <param name="exception">Failure raised by the probe attempt.</param>
/// <param name="process">Worker process that was being probed.</param>
/// <returns>
/// <c>false</c> for cancellation and <c>WorkerProcessLaunchException</c> failures, and
/// whenever the process has already exited; <c>true</c> otherwise.
/// </returns>
private static bool ShouldRetryStartupProbe(Exception exception, IWorkerProcess process)
{
    bool isNonRetryableFailure = exception is OperationCanceledException or WorkerProcessLaunchException;

    // Short-circuit keeps HasExited from being read for non-retryable failures.
    return !isNonRetryableFailure && !process.HasExited;
}
private static void ValidateRequest(WorkerProcessLaunchRequest request)
{
if (string.IsNullOrWhiteSpace(request.SessionId))