Add Polly resilience policies
This commit is contained in:
@@ -36,11 +36,22 @@ internal sealed class FakeGatewayTransport(MxGatewayClientOptions options) : IMx
|
|||||||
ProtocolStatus = new ProtocolStatus { Code = ProtocolStatusCode.Ok },
|
ProtocolStatus = new ProtocolStatus { Code = ProtocolStatusCode.Ok },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
public Queue<Exception> OpenSessionExceptions { get; } = new();
|
||||||
|
|
||||||
|
public Queue<Exception> CloseSessionExceptions { get; } = new();
|
||||||
|
|
||||||
|
public Queue<Exception> InvokeExceptions { get; } = new();
|
||||||
|
|
||||||
/// <summary>
/// Records the call, then either throws the next queued exception or returns the canned reply.
/// </summary>
/// <param name="request">Request to record.</param>
/// <param name="callOptions">Call options to record alongside the request.</param>
/// <returns>A completed task holding <c>OpenSessionReply</c> when no exception is queued.</returns>
public Task<OpenSessionReply> OpenSessionAsync(
    OpenSessionRequest request,
    CallOptions callOptions)
{
    // The call is recorded before any failure so tests can count failed attempts too.
    OpenSessionCalls.Add((request, callOptions));

    bool hasQueuedFailure = OpenSessionExceptions.TryDequeue(out Exception? queuedFailure);
    if (!hasQueuedFailure)
    {
        return Task.FromResult(OpenSessionReply);
    }

    // Thrown synchronously (not as a faulted task), one queued exception per call.
    throw queuedFailure!;
}
|
||||||
|
|
||||||
@@ -49,6 +60,11 @@ internal sealed class FakeGatewayTransport(MxGatewayClientOptions options) : IMx
|
|||||||
CallOptions callOptions)
|
CallOptions callOptions)
|
||||||
{
|
{
|
||||||
CloseSessionCalls.Add((request, callOptions));
|
CloseSessionCalls.Add((request, callOptions));
|
||||||
|
if (CloseSessionExceptions.TryDequeue(out Exception? exception))
|
||||||
|
{
|
||||||
|
throw exception;
|
||||||
|
}
|
||||||
|
|
||||||
return Task.FromResult(CloseSessionReply);
|
return Task.FromResult(CloseSessionReply);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -57,6 +73,11 @@ internal sealed class FakeGatewayTransport(MxGatewayClientOptions options) : IMx
|
|||||||
CallOptions callOptions)
|
CallOptions callOptions)
|
||||||
{
|
{
|
||||||
InvokeCalls.Add((request, callOptions));
|
InvokeCalls.Add((request, callOptions));
|
||||||
|
if (InvokeExceptions.TryDequeue(out Exception? exception))
|
||||||
|
{
|
||||||
|
throw exception;
|
||||||
|
}
|
||||||
|
|
||||||
return Task.FromResult(_invokeReplies.Dequeue());
|
return Task.FromResult(_invokeReplies.Dequeue());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -25,4 +25,17 @@ public sealed class MxGatewayClientOptionsTests
|
|||||||
|
|
||||||
Assert.Throws<ArgumentException>(options.Validate);
|
Assert.Throws<ArgumentException>(options.Validate);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Verifies that client options validation rejects a retry configuration with zero attempts.
/// </summary>
[Fact]
public void Validate_WithInvalidRetryOptions_Throws()
{
    // Arrange: otherwise-valid options carrying a retry budget of zero attempts.
    MxGatewayClientRetryOptions zeroAttemptRetry = new() { MaxAttempts = 0 };
    MxGatewayClientOptions options = new()
    {
        Endpoint = new Uri("http://localhost:5000"),
        ApiKey = "test-api-key",
        Retry = zeroAttemptRetry,
    };

    // Act / Assert
    Assert.Throws<ArgumentOutOfRangeException>(() => options.Validate());
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
using MxGateway.Contracts.Proto;
|
using MxGateway.Contracts.Proto;
|
||||||
|
using Grpc.Core;
|
||||||
|
|
||||||
namespace MxGateway.Client.Tests;
|
namespace MxGateway.Client.Tests;
|
||||||
|
|
||||||
@@ -220,6 +221,55 @@ public sealed class MxGatewayClientSessionTests
|
|||||||
Assert.Equal("session-fixture", call.Request.SessionId);
|
Assert.Equal("session-fixture", call.Request.SessionId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Verifies that a Ping invoke is attempted a second time after one transient gRPC failure.
/// </summary>
[Fact]
public async Task InvokeAsync_RetriesSafeDiagnosticCommandOnTransientGrpcFailure()
{
    // Arrange: one queued transient failure, followed by a successful Ping reply.
    var pingReply = new MxCommandReply
    {
        SessionId = "session-fixture",
        Kind = MxCommandKind.Ping,
        ProtocolStatus = new ProtocolStatus { Code = ProtocolStatusCode.Ok },
    };
    FakeGatewayTransport transport = CreateTransport();
    transport.InvokeExceptions.Enqueue(CreateTransientRpcException());
    transport.AddInvokeReply(pingReply);
    await using MxGatewayClient client = CreateClient(transport);
    MxGatewaySession session = await client.OpenSessionAsync();
    var pingRequest = new MxCommandRequest
    {
        SessionId = session.SessionId,
        Command = new MxCommand { Kind = MxCommandKind.Ping, Ping = new PingCommand() },
    };

    // Act
    await session.InvokeAsync(pingRequest);

    // Assert: the failed attempt plus the retried attempt.
    Assert.Equal(2, transport.InvokeCalls.Count);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Verifies that a transient gRPC failure on OpenSession surfaces to the caller after a single attempt.
/// </summary>
[Fact]
public async Task OpenSessionAsync_DoesNotRetryTransientGrpcFailure()
{
    // Arrange
    FakeGatewayTransport transport = CreateTransport();
    transport.OpenSessionExceptions.Enqueue(CreateTransientRpcException());
    await using MxGatewayClient client = CreateClient(transport);

    // Act
    Func<Task> openSession = async () => await client.OpenSessionAsync();
    await Assert.ThrowsAsync<RpcException>(openSession);

    // Assert: exactly one transport call means no retry happened.
    Assert.Single(transport.OpenSessionCalls);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Verifies that a write command hitting a transient gRPC failure is not attempted a second time.
/// </summary>
[Fact]
public async Task InvokeAsync_DoesNotRetryWriteCommand()
{
    // Arrange
    FakeGatewayTransport transport = CreateTransport();
    transport.InvokeExceptions.Enqueue(CreateTransientRpcException());
    await using MxGatewayClient client = CreateClient(transport);
    MxGatewaySession session = await client.OpenSessionAsync();

    // Act
    Func<Task> writeRaw = async () =>
        await session.WriteRawAsync(1, 2, 3.ToMxValue(), userId: 0);
    await Assert.ThrowsAsync<RpcException>(writeRaw);

    // Assert: exactly one Invoke call means the write was not retried.
    Assert.Single(transport.InvokeCalls);
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public async Task InvokeHelpers_PassCancellationTokenToTransport()
|
public async Task InvokeHelpers_PassCancellationTokenToTransport()
|
||||||
{
|
{
|
||||||
@@ -252,4 +302,9 @@ public sealed class MxGatewayClientSessionTests
|
|||||||
ApiKey = "test-api-key",
|
ApiKey = "test-api-key",
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Builds an <see cref="RpcException"/> with status <c>Unavailable</c> for use as a transient failure in tests.
/// </summary>
private static RpcException CreateTransientRpcException()
{
    Status unavailable = new(StatusCode.Unavailable, "gateway unavailable");
    return new RpcException(unavailable);
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,6 +7,7 @@
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Grpc.Net.Client" Version="2.76.0" />
|
<PackageReference Include="Grpc.Net.Client" Version="2.76.0" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.7" />
|
||||||
|
<PackageReference Include="Polly.Core" Version="8.6.6" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<PropertyGroup>
|
<PropertyGroup>
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
using Grpc.Core;
|
using Grpc.Core;
|
||||||
using Grpc.Net.Client;
|
using Grpc.Net.Client;
|
||||||
|
using Microsoft.Extensions.Logging;
|
||||||
using MxGateway.Contracts.Proto;
|
using MxGateway.Contracts.Proto;
|
||||||
|
using Polly;
|
||||||
|
|
||||||
namespace MxGateway.Client;
|
namespace MxGateway.Client;
|
||||||
|
|
||||||
@@ -11,6 +13,7 @@ public sealed class MxGatewayClient : IAsyncDisposable
|
|||||||
{
|
{
|
||||||
private readonly GrpcChannel _channel;
|
private readonly GrpcChannel _channel;
|
||||||
private readonly IMxGatewayClientTransport _transport;
|
private readonly IMxGatewayClientTransport _transport;
|
||||||
|
private readonly ResiliencePipeline _safeUnaryRetryPipeline;
|
||||||
private bool _disposed;
|
private bool _disposed;
|
||||||
|
|
||||||
internal MxGatewayClient(
|
internal MxGatewayClient(
|
||||||
@@ -22,6 +25,9 @@ public sealed class MxGatewayClient : IAsyncDisposable
|
|||||||
|
|
||||||
Options = options;
|
Options = options;
|
||||||
_transport = transport ?? throw new ArgumentNullException(nameof(transport));
|
_transport = transport ?? throw new ArgumentNullException(nameof(transport));
|
||||||
|
_safeUnaryRetryPipeline = MxGatewayClientRetryPolicy.Create(
|
||||||
|
options.Retry,
|
||||||
|
options.LoggerFactory?.CreateLogger<MxGatewayClient>());
|
||||||
_channel = null!;
|
_channel = null!;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -32,6 +38,9 @@ public sealed class MxGatewayClient : IAsyncDisposable
|
|||||||
_channel = channel;
|
_channel = channel;
|
||||||
_transport = transport;
|
_transport = transport;
|
||||||
Options = transport.Options;
|
Options = transport.Options;
|
||||||
|
_safeUnaryRetryPipeline = MxGatewayClientRetryPolicy.Create(
|
||||||
|
Options.Retry,
|
||||||
|
Options.LoggerFactory?.CreateLogger<MxGatewayClient>());
|
||||||
}
|
}
|
||||||
|
|
||||||
public MxGatewayClientOptions Options { get; }
|
public MxGatewayClientOptions Options { get; }
|
||||||
@@ -88,7 +97,9 @@ public sealed class MxGatewayClient : IAsyncDisposable
|
|||||||
ArgumentNullException.ThrowIfNull(request);
|
ArgumentNullException.ThrowIfNull(request);
|
||||||
ThrowIfDisposed();
|
ThrowIfDisposed();
|
||||||
|
|
||||||
return _transport.CloseSessionAsync(request, CreateCallOptions(cancellationToken));
|
return ExecuteSafeUnaryAsync(
|
||||||
|
token => _transport.CloseSessionAsync(request, CreateCallOptions(token)),
|
||||||
|
cancellationToken);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Task<MxCommandReply> InvokeAsync(
|
public Task<MxCommandReply> InvokeAsync(
|
||||||
@@ -98,6 +109,13 @@ public sealed class MxGatewayClient : IAsyncDisposable
|
|||||||
ArgumentNullException.ThrowIfNull(request);
|
ArgumentNullException.ThrowIfNull(request);
|
||||||
ThrowIfDisposed();
|
ThrowIfDisposed();
|
||||||
|
|
||||||
|
if (MxGatewayClientRetryPolicy.IsRetryableCommand(request.Command?.Kind ?? MxCommandKind.Unspecified))
|
||||||
|
{
|
||||||
|
return ExecuteSafeUnaryAsync(
|
||||||
|
token => _transport.InvokeAsync(request, CreateCallOptions(token)),
|
||||||
|
cancellationToken);
|
||||||
|
}
|
||||||
|
|
||||||
return _transport.InvokeAsync(request, CreateCallOptions(cancellationToken));
|
return _transport.InvokeAsync(request, CreateCallOptions(cancellationToken));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -136,6 +154,16 @@ public sealed class MxGatewayClient : IAsyncDisposable
|
|||||||
cancellationToken);
|
cancellationToken);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Runs a unary call through the safe-retry pipeline held in <c>_safeUnaryRetryPipeline</c>.
/// </summary>
/// <typeparam name="T">Reply type produced by the call.</typeparam>
/// <param name="call">Delegate invoked by the pipeline with the token the pipeline supplies.</param>
/// <param name="cancellationToken">Caller's cancellation token, handed to the pipeline.</param>
/// <returns>A task that completes with the call's reply.</returns>
private Task<T> ExecuteSafeUnaryAsync<T>(
    Func<CancellationToken, Task<T>> call,
    CancellationToken cancellationToken)
{
    // The pipeline works in ValueTask; convert back to Task to match the public client surface.
    ValueTask<T> pipelineResult = _safeUnaryRetryPipeline.ExecuteAsync(
        async attemptToken => await call(attemptToken).ConfigureAwait(false),
        cancellationToken);

    return pipelineResult.AsTask();
}
|
||||||
|
|
||||||
private void ThrowIfDisposed()
|
private void ThrowIfDisposed()
|
||||||
{
|
{
|
||||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||||
|
|||||||
@@ -21,6 +21,8 @@ public sealed class MxGatewayClientOptions
|
|||||||
|
|
||||||
public TimeSpan DefaultCallTimeout { get; init; } = TimeSpan.FromSeconds(30);
|
public TimeSpan DefaultCallTimeout { get; init; } = TimeSpan.FromSeconds(30);
|
||||||
|
|
||||||
|
public MxGatewayClientRetryOptions Retry { get; init; } = new();
|
||||||
|
|
||||||
public ILoggerFactory? LoggerFactory { get; init; }
|
public ILoggerFactory? LoggerFactory { get; init; }
|
||||||
|
|
||||||
public void Validate()
|
public void Validate()
|
||||||
@@ -54,5 +56,7 @@ public sealed class MxGatewayClientOptions
|
|||||||
nameof(DefaultCallTimeout),
|
nameof(DefaultCallTimeout),
|
||||||
"The default call timeout must be greater than zero.");
|
"The default call timeout must be greater than zero.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Retry.Validate();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,43 @@
|
|||||||
|
namespace MxGateway.Client;
|
||||||
|
|
||||||
|
/// <summary>
/// Retry settings for gateway client calls. <see cref="Validate"/> enforces the allowed ranges.
/// </summary>
public sealed class MxGatewayClientRetryOptions
{
    /// <summary>Attempt budget for a retryable call; must be greater than zero. Defaults to 2.</summary>
    public int MaxAttempts { get; init; } = 2;

    /// <summary>Base delay between attempts; must be greater than zero. Defaults to 200 ms.</summary>
    public TimeSpan Delay { get; init; } = TimeSpan.FromMilliseconds(200);

    /// <summary>Upper bound on the delay; must be positive and not smaller than <see cref="Delay"/>. Defaults to 2 s.</summary>
    public TimeSpan MaxDelay { get; init; } = TimeSpan.FromSeconds(2);

    /// <summary>Whether jitter is applied to retry delays. Defaults to <see langword="true"/>.</summary>
    public bool UseJitter { get; init; } = true;

    /// <summary>
    /// Checks every setting and throws on the first one that is out of range.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <see cref="MaxAttempts"/>, <see cref="Delay"/> or <see cref="MaxDelay"/> is not positive,
    /// or <see cref="MaxDelay"/> is smaller than <see cref="Delay"/>.
    /// </exception>
    public void Validate()
    {
        if (MaxAttempts < 1)
        {
            throw OutOfRange(
                nameof(MaxAttempts),
                "The retry max attempts value must be greater than zero.");
        }

        if (!(Delay > TimeSpan.Zero))
        {
            throw OutOfRange(
                nameof(Delay),
                "The retry delay must be greater than zero.");
        }

        if (!(MaxDelay > TimeSpan.Zero))
        {
            throw OutOfRange(
                nameof(MaxDelay),
                "The retry max delay must be greater than zero.");
        }

        if (Delay > MaxDelay)
        {
            throw OutOfRange(
                nameof(MaxDelay),
                "The retry max delay must be greater than or equal to the retry delay.");
        }
    }

    // Single construction point for the range errors raised by Validate.
    private static ArgumentOutOfRangeException OutOfRange(string parameterName, string message)
    {
        return new ArgumentOutOfRangeException(parameterName, message);
    }
}
|
||||||
@@ -0,0 +1,62 @@
|
|||||||
|
using Grpc.Core;
|
||||||
|
using Microsoft.Extensions.Logging;
|
||||||
|
using MxGateway.Contracts.Proto;
|
||||||
|
using Polly;
|
||||||
|
using Polly.Retry;
|
||||||
|
|
||||||
|
namespace MxGateway.Client;
|
||||||
|
|
||||||
|
/// <summary>
/// Builds the retry pipeline for gateway client calls and decides which commands
/// and which gRPC failures qualify for a retry.
/// </summary>
internal static class MxGatewayClientRetryPolicy
{
    /// <summary>
    /// Creates a pipeline that retries transient gRPC failures with exponential backoff.
    /// </summary>
    /// <param name="options">Retry settings; validated before the pipeline is built.</param>
    /// <param name="logger">Optional logger that receives a debug entry before each retry.</param>
    /// <returns>
    /// A retry pipeline, or <see cref="ResiliencePipeline.Empty"/> when
    /// <see cref="MxGatewayClientRetryOptions.MaxAttempts"/> leaves no room for a retry.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="options"/> fails validation.</exception>
    public static ResiliencePipeline Create(
        MxGatewayClientRetryOptions options,
        ILogger? logger)
    {
        ArgumentNullException.ThrowIfNull(options);
        options.Validate();

        // MaxAttempts counts every attempt including the first, so retries = attempts - 1.
        int retryAttempts = options.MaxAttempts - 1;
        if (retryAttempts < 1)
        {
            // Polly's retry options reject MaxRetryAttempts below 1, so a single-attempt
            // configuration (valid per Validate) must bypass the retry strategy entirely.
            return ResiliencePipeline.Empty;
        }

        return new ResiliencePipelineBuilder()
            .AddRetry(new RetryStrategyOptions
            {
                MaxRetryAttempts = retryAttempts,
                BackoffType = DelayBackoffType.Exponential,
                UseJitter = options.UseJitter,
                Delay = options.Delay,
                MaxDelay = options.MaxDelay,
                ShouldHandle = new PredicateBuilder().Handle<Exception>(IsTransientGrpcFailure),
                OnRetry = args =>
                {
                    logger?.LogDebug(
                        args.Outcome.Exception,
                        "Retrying MXAccess Gateway client call after transient gRPC failure. Attempt {Attempt}.",
                        args.AttemptNumber + 1);
                    return default;
                },
            })
            .Build();
    }

    /// <summary>
    /// Reports whether an invoke command of the given kind may be retried.
    /// </summary>
    /// <param name="kind">Command kind taken from the request.</param>
    /// <returns>
    /// <see langword="true"/> only for <c>Ping</c>, <c>GetSessionState</c> and <c>GetWorkerInfo</c>.
    /// </returns>
    public static bool IsRetryableCommand(MxCommandKind kind)
    {
        return kind is MxCommandKind.Ping
            or MxCommandKind.GetSessionState
            or MxCommandKind.GetWorkerInfo;
    }

    // A failure is transient when it is an RpcException, or an MxGatewayException
    // directly wrapping one, whose status code is in the transient set.
    private static bool IsTransientGrpcFailure(Exception exception)
    {
        return exception switch
        {
            RpcException rpcException => IsTransientStatus(rpcException.StatusCode),
            MxGatewayException { InnerException: RpcException rpcException } => IsTransientStatus(rpcException.StatusCode),
            _ => false,
        };
    }

    // Status codes treated as transient: Unavailable, DeadlineExceeded, ResourceExhausted.
    private static bool IsTransientStatus(StatusCode statusCode)
    {
        return statusCode is StatusCode.Unavailable
            or StatusCode.DeadlineExceeded
            or StatusCode.ResourceExhausted;
    }
}
|
||||||
@@ -37,6 +37,19 @@ The default probe only verifies that the worker did not exit immediately. The
|
|||||||
worker client replaces this probe when pipe connection, hello, and
|
worker client replaces this probe when pipe connection, hello, and
|
||||||
`WorkerReady` handling are implemented.
|
`WorkerReady` handling are implemented.
|
||||||
|
|
||||||
|
Startup probing uses a bounded Polly retry policy. The gateway starts the worker
|
||||||
|
process once, then retries only transient startup-probe failures while the
|
||||||
|
process remains alive. The policy is configured by
|
||||||
|
`WorkerOptions.StartupProbeRetryAttempts` and
|
||||||
|
`WorkerOptions.StartupProbeRetryDelayMilliseconds`; the retry counter is
|
||||||
|
recorded as `mxgateway.retries.attempted` with `area=worker_startup`.
|
||||||
|
|
||||||
|
The launcher also passes
|
||||||
|
`MXGATEWAY_WORKER_PIPE_CONNECT_ATTEMPT_TIMEOUT_MS` to the worker process from
|
||||||
|
`WorkerOptions.PipeConnectAttemptTimeoutMilliseconds`. The worker uses that
|
||||||
|
value as the per-attempt named-pipe connect timeout inside its own bounded
|
||||||
|
Polly retry loop.
|
||||||
|
|
||||||
If startup fails or exceeds `WorkerOptions.StartupTimeoutSeconds`, the launcher
|
If startup fails or exceeds `WorkerOptions.StartupTimeoutSeconds`, the launcher
|
||||||
kills the worker process tree, disposes the process handle, disposes the
|
kills the worker process tree, disposes the process handle, disposes the
|
||||||
optional pipe reservation, records a worker kill metric, and reports a
|
optional pipe reservation, records a worker kill metric, and reports a
|
||||||
|
|||||||
@@ -110,10 +110,17 @@ public sealed class MxGatewayClientOptions
|
|||||||
public string? ServerNameOverride { get; init; }
|
public string? ServerNameOverride { get; init; }
|
||||||
public TimeSpan ConnectTimeout { get; init; } = TimeSpan.FromSeconds(10);
|
public TimeSpan ConnectTimeout { get; init; } = TimeSpan.FromSeconds(10);
|
||||||
public TimeSpan DefaultCallTimeout { get; init; } = TimeSpan.FromSeconds(30);
|
public TimeSpan DefaultCallTimeout { get; init; } = TimeSpan.FromSeconds(30);
|
||||||
|
public MxGatewayClientRetryOptions Retry { get; init; } = new();
|
||||||
public ILoggerFactory? LoggerFactory { get; init; }
|
public ILoggerFactory? LoggerFactory { get; init; }
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
The .NET client applies a bounded Polly retry policy only to idempotent calls:
|
||||||
|
`CloseSession` and diagnostic `Invoke` commands such as `Ping`,
|
||||||
|
`GetSessionState`, and `GetWorkerInfo`. It does not retry `OpenSession`, event
|
||||||
|
streams, writes, secured writes, authentication, registration, item management,
|
||||||
|
or subscription changes because those calls can partially succeed in MXAccess.
|
||||||
|
|
||||||
API key may be loaded from `MXGATEWAY_API_KEY` by the CLI, not implicitly by the
|
API key may be loaded from `MXGATEWAY_API_KEY` by the CLI, not implicitly by the
|
||||||
library constructor unless a helper explicitly says it does that.
|
library constructor unless a helper explicitly says it does that.
|
||||||
|
|
||||||
|
|||||||
@@ -843,6 +843,9 @@ Suggested configuration shape:
|
|||||||
"Worker": {
|
"Worker": {
|
||||||
"ExecutablePath": "src/MxGateway.Worker/bin/x86/Release/MxGateway.Worker.exe",
|
"ExecutablePath": "src/MxGateway.Worker/bin/x86/Release/MxGateway.Worker.exe",
|
||||||
"StartupTimeoutSeconds": 30,
|
"StartupTimeoutSeconds": 30,
|
||||||
|
"StartupProbeRetryAttempts": 3,
|
||||||
|
"StartupProbeRetryDelayMilliseconds": 250,
|
||||||
|
"PipeConnectAttemptTimeoutMilliseconds": 2000,
|
||||||
"ShutdownTimeoutSeconds": 10,
|
"ShutdownTimeoutSeconds": 10,
|
||||||
"HeartbeatIntervalSeconds": 5,
|
"HeartbeatIntervalSeconds": 5,
|
||||||
"HeartbeatGraceSeconds": 15,
|
"HeartbeatGraceSeconds": 15,
|
||||||
|
|||||||
@@ -80,6 +80,18 @@ public sealed class GatewayOptionsValidator : IValidateOptions<GatewayOptions>
|
|||||||
options.StartupTimeoutSeconds,
|
options.StartupTimeoutSeconds,
|
||||||
"MxGateway:Worker:StartupTimeoutSeconds must be greater than zero.",
|
"MxGateway:Worker:StartupTimeoutSeconds must be greater than zero.",
|
||||||
failures);
|
failures);
|
||||||
|
AddIfNotPositive(
|
||||||
|
options.StartupProbeRetryAttempts,
|
||||||
|
"MxGateway:Worker:StartupProbeRetryAttempts must be greater than zero.",
|
||||||
|
failures);
|
||||||
|
AddIfNotPositive(
|
||||||
|
options.StartupProbeRetryDelayMilliseconds,
|
||||||
|
"MxGateway:Worker:StartupProbeRetryDelayMilliseconds must be greater than zero.",
|
||||||
|
failures);
|
||||||
|
AddIfNotPositive(
|
||||||
|
options.PipeConnectAttemptTimeoutMilliseconds,
|
||||||
|
"MxGateway:Worker:PipeConnectAttemptTimeoutMilliseconds must be greater than zero.",
|
||||||
|
failures);
|
||||||
AddIfNotPositive(
|
AddIfNotPositive(
|
||||||
options.ShutdownTimeoutSeconds,
|
options.ShutdownTimeoutSeconds,
|
||||||
"MxGateway:Worker:ShutdownTimeoutSeconds must be greater than zero.",
|
"MxGateway:Worker:ShutdownTimeoutSeconds must be greater than zero.",
|
||||||
|
|||||||
@@ -11,6 +11,12 @@ public sealed class WorkerOptions
|
|||||||
|
|
||||||
public int StartupTimeoutSeconds { get; init; } = 30;
|
public int StartupTimeoutSeconds { get; init; } = 30;
|
||||||
|
|
||||||
|
public int StartupProbeRetryAttempts { get; init; } = 3;
|
||||||
|
|
||||||
|
public int StartupProbeRetryDelayMilliseconds { get; init; } = 250;
|
||||||
|
|
||||||
|
public int PipeConnectAttemptTimeoutMilliseconds { get; init; } = 2000;
|
||||||
|
|
||||||
public int ShutdownTimeoutSeconds { get; init; } = 10;
|
public int ShutdownTimeoutSeconds { get; init; } = 10;
|
||||||
|
|
||||||
public int HeartbeatIntervalSeconds { get; init; } = 5;
|
public int HeartbeatIntervalSeconds { get; init; } = 5;
|
||||||
|
|||||||
@@ -20,11 +20,13 @@ public sealed class GatewayMetrics : IDisposable
|
|||||||
private readonly Counter<long> _workerExitsCounter;
|
private readonly Counter<long> _workerExitsCounter;
|
||||||
private readonly Counter<long> _heartbeatFailuresCounter;
|
private readonly Counter<long> _heartbeatFailuresCounter;
|
||||||
private readonly Counter<long> _streamDisconnectsCounter;
|
private readonly Counter<long> _streamDisconnectsCounter;
|
||||||
|
private readonly Counter<long> _retryAttemptsCounter;
|
||||||
private readonly Histogram<double> _workerStartupLatencyHistogram;
|
private readonly Histogram<double> _workerStartupLatencyHistogram;
|
||||||
private readonly Histogram<double> _commandLatencyHistogram;
|
private readonly Histogram<double> _commandLatencyHistogram;
|
||||||
private readonly Histogram<double> _eventStreamSendLatencyHistogram;
|
private readonly Histogram<double> _eventStreamSendLatencyHistogram;
|
||||||
private readonly Dictionary<string, long> _commandFailuresByMethod = new(StringComparer.OrdinalIgnoreCase);
|
private readonly Dictionary<string, long> _commandFailuresByMethod = new(StringComparer.OrdinalIgnoreCase);
|
||||||
private readonly Dictionary<string, long> _eventsByFamily = new(StringComparer.OrdinalIgnoreCase);
|
private readonly Dictionary<string, long> _eventsByFamily = new(StringComparer.OrdinalIgnoreCase);
|
||||||
|
private readonly Dictionary<string, long> _retryAttemptsByArea = new(StringComparer.OrdinalIgnoreCase);
|
||||||
|
|
||||||
private int _openSessions;
|
private int _openSessions;
|
||||||
private int _workersRunning;
|
private int _workersRunning;
|
||||||
@@ -41,6 +43,7 @@ public sealed class GatewayMetrics : IDisposable
|
|||||||
private long _workerExits;
|
private long _workerExits;
|
||||||
private long _heartbeatFailures;
|
private long _heartbeatFailures;
|
||||||
private long _streamDisconnects;
|
private long _streamDisconnects;
|
||||||
|
private long _retryAttempts;
|
||||||
private bool _disposed;
|
private bool _disposed;
|
||||||
|
|
||||||
public GatewayMetrics()
|
public GatewayMetrics()
|
||||||
@@ -58,6 +61,7 @@ public sealed class GatewayMetrics : IDisposable
|
|||||||
_workerExitsCounter = _meter.CreateCounter<long>("mxgateway.workers.exited");
|
_workerExitsCounter = _meter.CreateCounter<long>("mxgateway.workers.exited");
|
||||||
_heartbeatFailuresCounter = _meter.CreateCounter<long>("mxgateway.heartbeats.failed");
|
_heartbeatFailuresCounter = _meter.CreateCounter<long>("mxgateway.heartbeats.failed");
|
||||||
_streamDisconnectsCounter = _meter.CreateCounter<long>("mxgateway.grpc.streams.disconnected");
|
_streamDisconnectsCounter = _meter.CreateCounter<long>("mxgateway.grpc.streams.disconnected");
|
||||||
|
_retryAttemptsCounter = _meter.CreateCounter<long>("mxgateway.retries.attempted");
|
||||||
_workerStartupLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.workers.startup.duration", "ms");
|
_workerStartupLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.workers.startup.duration", "ms");
|
||||||
_commandLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.commands.duration", "ms");
|
_commandLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.commands.duration", "ms");
|
||||||
_eventStreamSendLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.events.stream_send.duration", "ms");
|
_eventStreamSendLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.events.stream_send.duration", "ms");
|
||||||
@@ -238,6 +242,17 @@ public sealed class GatewayMetrics : IDisposable
|
|||||||
_streamDisconnectsCounter.Add(1, new KeyValuePair<string, object?>("reason", reason));
|
_streamDisconnectsCounter.Add(1, new KeyValuePair<string, object?>("reason", reason));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Records one retry attempt for the given area in both the snapshot counters and the meter.
/// </summary>
/// <param name="area">Area label; used as the per-area dictionary key and as the <c>area</c> tag.</param>
public void RetryAttempted(string area)
{
    KeyValuePair<string, object?> areaTag = new("area", area);

    // Snapshot state is guarded by _syncRoot; the meter counter is updated outside the lock.
    lock (_syncRoot)
    {
        _retryAttempts += 1;
        Increment(_retryAttemptsByArea, area);
    }

    _retryAttemptsCounter.Add(1, areaTag);
}
|
||||||
|
|
||||||
public GatewayMetricsSnapshot GetSnapshot()
|
public GatewayMetricsSnapshot GetSnapshot()
|
||||||
{
|
{
|
||||||
lock (_syncRoot)
|
lock (_syncRoot)
|
||||||
@@ -258,8 +273,10 @@ public sealed class GatewayMetrics : IDisposable
|
|||||||
WorkerExits: _workerExits,
|
WorkerExits: _workerExits,
|
||||||
HeartbeatFailures: _heartbeatFailures,
|
HeartbeatFailures: _heartbeatFailures,
|
||||||
StreamDisconnects: _streamDisconnects,
|
StreamDisconnects: _streamDisconnects,
|
||||||
|
RetryAttempts: _retryAttempts,
|
||||||
CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase),
|
CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase),
|
||||||
EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase));
|
EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase),
|
||||||
|
RetryAttemptsByArea: new Dictionary<string, long>(_retryAttemptsByArea, StringComparer.OrdinalIgnoreCase));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -16,5 +16,7 @@ public sealed record GatewayMetricsSnapshot(
|
|||||||
long WorkerExits,
|
long WorkerExits,
|
||||||
long HeartbeatFailures,
|
long HeartbeatFailures,
|
||||||
long StreamDisconnects,
|
long StreamDisconnects,
|
||||||
|
long RetryAttempts,
|
||||||
IReadOnlyDictionary<string, long> CommandFailuresByMethod,
|
IReadOnlyDictionary<string, long> CommandFailuresByMethod,
|
||||||
IReadOnlyDictionary<string, long> EventsByFamily);
|
IReadOnlyDictionary<string, long> EventsByFamily,
|
||||||
|
IReadOnlyDictionary<string, long> RetryAttemptsByArea);
|
||||||
|
|||||||
@@ -7,6 +7,7 @@
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Grpc.AspNetCore" Version="2.76.0" />
|
<PackageReference Include="Grpc.AspNetCore" Version="2.76.0" />
|
||||||
<PackageReference Include="Microsoft.Data.Sqlite" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Data.Sqlite" Version="10.0.7" />
|
||||||
|
<PackageReference Include="Polly.Core" Version="8.6.6" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -1,25 +1,33 @@
|
|||||||
using System.Diagnostics;
|
using System.Diagnostics;
|
||||||
|
using Microsoft.Extensions.Logging;
|
||||||
|
using Microsoft.Extensions.Logging.Abstractions;
|
||||||
using Microsoft.Extensions.Options;
|
using Microsoft.Extensions.Options;
|
||||||
using MxGateway.Server.Configuration;
|
using MxGateway.Server.Configuration;
|
||||||
using MxGateway.Server.Metrics;
|
using MxGateway.Server.Metrics;
|
||||||
|
using Polly;
|
||||||
|
using Polly.Retry;
|
||||||
|
|
||||||
namespace MxGateway.Server.Workers;
|
namespace MxGateway.Server.Workers;
|
||||||
|
|
||||||
public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
|
public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
|
||||||
{
|
{
|
||||||
public const string WorkerNonceEnvironmentVariableName = "MXGATEWAY_WORKER_NONCE";
|
public const string WorkerNonceEnvironmentVariableName = "MXGATEWAY_WORKER_NONCE";
|
||||||
|
public const string WorkerPipeConnectAttemptTimeoutEnvironmentVariableName =
|
||||||
|
"MXGATEWAY_WORKER_PIPE_CONNECT_ATTEMPT_TIMEOUT_MS";
|
||||||
|
|
||||||
private readonly IWorkerProcessFactory _processFactory;
|
private readonly IWorkerProcessFactory _processFactory;
|
||||||
private readonly IWorkerStartupProbe _startupProbe;
|
private readonly IWorkerStartupProbe _startupProbe;
|
||||||
private readonly GatewayMetrics _metrics;
|
private readonly GatewayMetrics _metrics;
|
||||||
private readonly TimeProvider _timeProvider;
|
private readonly TimeProvider _timeProvider;
|
||||||
private readonly WorkerOptions _workerOptions;
|
private readonly WorkerOptions _workerOptions;
|
||||||
|
private readonly ILogger<WorkerProcessLauncher> _logger;
|
||||||
|
|
||||||
public WorkerProcessLauncher(
|
public WorkerProcessLauncher(
|
||||||
IOptions<GatewayOptions> gatewayOptions,
|
IOptions<GatewayOptions> gatewayOptions,
|
||||||
IWorkerProcessFactory processFactory,
|
IWorkerProcessFactory processFactory,
|
||||||
IWorkerStartupProbe startupProbe,
|
IWorkerStartupProbe startupProbe,
|
||||||
GatewayMetrics metrics,
|
GatewayMetrics metrics,
|
||||||
|
ILogger<WorkerProcessLauncher>? logger = null,
|
||||||
TimeProvider? timeProvider = null)
|
TimeProvider? timeProvider = null)
|
||||||
{
|
{
|
||||||
ArgumentNullException.ThrowIfNull(gatewayOptions);
|
ArgumentNullException.ThrowIfNull(gatewayOptions);
|
||||||
@@ -32,6 +40,7 @@ public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
|
|||||||
_startupProbe = startupProbe;
|
_startupProbe = startupProbe;
|
||||||
_metrics = metrics;
|
_metrics = metrics;
|
||||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||||
|
_logger = logger ?? NullLogger<WorkerProcessLauncher>.Instance;
|
||||||
}
|
}
|
||||||
|
|
||||||
public async Task<WorkerProcessHandle> LaunchAsync(
|
public async Task<WorkerProcessHandle> LaunchAsync(
|
||||||
@@ -76,8 +85,15 @@ public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
|
|||||||
using CancellationTokenSource startupTimeout = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
|
using CancellationTokenSource startupTimeout = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
|
||||||
startupTimeout.CancelAfter(TimeSpan.FromSeconds(_workerOptions.StartupTimeoutSeconds));
|
startupTimeout.CancelAfter(TimeSpan.FromSeconds(_workerOptions.StartupTimeoutSeconds));
|
||||||
|
|
||||||
await _startupProbe
|
await CreateStartupProbePipeline(process)
|
||||||
.WaitUntilReadyAsync(process, request, startupTimeout.Token)
|
.ExecuteAsync(
|
||||||
|
async token =>
|
||||||
|
{
|
||||||
|
await _startupProbe
|
||||||
|
.WaitUntilReadyAsync(process, request, token)
|
||||||
|
.ConfigureAwait(false);
|
||||||
|
},
|
||||||
|
startupTimeout.Token)
|
||||||
.ConfigureAwait(false);
|
.ConfigureAwait(false);
|
||||||
|
|
||||||
_metrics.WorkerStarted(_timeProvider.GetUtcNow() - startedAt);
|
_metrics.WorkerStarted(_timeProvider.GetUtcNow() - startedAt);
|
||||||
@@ -143,6 +159,8 @@ public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
|
|||||||
}
|
}
|
||||||
|
|
||||||
startInfo.Environment[WorkerNonceEnvironmentVariableName] = request.Nonce;
|
startInfo.Environment[WorkerNonceEnvironmentVariableName] = request.Nonce;
|
||||||
|
startInfo.Environment[WorkerPipeConnectAttemptTimeoutEnvironmentVariableName] =
|
||||||
|
_workerOptions.PipeConnectAttemptTimeoutMilliseconds.ToString(System.Globalization.CultureInfo.InvariantCulture);
|
||||||
|
|
||||||
commandLine = new WorkerProcessCommandLine(executablePath, arguments);
|
commandLine = new WorkerProcessCommandLine(executablePath, arguments);
|
||||||
|
|
||||||
@@ -229,6 +247,43 @@ public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private ResiliencePipeline CreateStartupProbePipeline(IWorkerProcess process)
|
||||||
|
{
|
||||||
|
RetryStrategyOptions retryOptions = new()
|
||||||
|
{
|
||||||
|
MaxRetryAttempts = Math.Max(0, _workerOptions.StartupProbeRetryAttempts - 1),
|
||||||
|
BackoffType = DelayBackoffType.Exponential,
|
||||||
|
UseJitter = true,
|
||||||
|
Delay = TimeSpan.FromMilliseconds(_workerOptions.StartupProbeRetryDelayMilliseconds),
|
||||||
|
MaxDelay = TimeSpan.FromSeconds(2),
|
||||||
|
ShouldHandle = new PredicateBuilder().Handle<Exception>(exception =>
|
||||||
|
ShouldRetryStartupProbe(exception, process)),
|
||||||
|
OnRetry = args =>
|
||||||
|
{
|
||||||
|
_metrics.RetryAttempted("worker_startup");
|
||||||
|
_logger.LogDebug(
|
||||||
|
args.Outcome.Exception,
|
||||||
|
"Retrying worker startup probe after transient failure. Attempt {Attempt}.",
|
||||||
|
args.AttemptNumber + 1);
|
||||||
|
return default;
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
return new ResiliencePipelineBuilder()
|
||||||
|
.AddRetry(retryOptions)
|
||||||
|
.Build();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static bool ShouldRetryStartupProbe(Exception exception, IWorkerProcess process)
|
||||||
|
{
|
||||||
|
if (exception is OperationCanceledException or WorkerProcessLaunchException)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return !process.HasExited;
|
||||||
|
}
|
||||||
|
|
||||||
private static void ValidateRequest(WorkerProcessLaunchRequest request)
|
private static void ValidateRequest(WorkerProcessLaunchRequest request)
|
||||||
{
|
{
|
||||||
if (string.IsNullOrWhiteSpace(request.SessionId))
|
if (string.IsNullOrWhiteSpace(request.SessionId))
|
||||||
|
|||||||
@@ -20,6 +20,9 @@ public sealed class GatewayOptionsTests
|
|||||||
Assert.Equal(@"src\MxGateway.Worker\bin\x86\Release\MxGateway.Worker.exe", options.Worker.ExecutablePath);
|
Assert.Equal(@"src\MxGateway.Worker\bin\x86\Release\MxGateway.Worker.exe", options.Worker.ExecutablePath);
|
||||||
Assert.Equal(WorkerArchitecture.X86, options.Worker.RequiredArchitecture);
|
Assert.Equal(WorkerArchitecture.X86, options.Worker.RequiredArchitecture);
|
||||||
Assert.Equal(30, options.Worker.StartupTimeoutSeconds);
|
Assert.Equal(30, options.Worker.StartupTimeoutSeconds);
|
||||||
|
Assert.Equal(3, options.Worker.StartupProbeRetryAttempts);
|
||||||
|
Assert.Equal(250, options.Worker.StartupProbeRetryDelayMilliseconds);
|
||||||
|
Assert.Equal(2000, options.Worker.PipeConnectAttemptTimeoutMilliseconds);
|
||||||
Assert.Equal(10, options.Worker.ShutdownTimeoutSeconds);
|
Assert.Equal(10, options.Worker.ShutdownTimeoutSeconds);
|
||||||
Assert.Equal(5, options.Worker.HeartbeatIntervalSeconds);
|
Assert.Equal(5, options.Worker.HeartbeatIntervalSeconds);
|
||||||
Assert.Equal(15, options.Worker.HeartbeatGraceSeconds);
|
Assert.Equal(15, options.Worker.HeartbeatGraceSeconds);
|
||||||
@@ -66,6 +69,8 @@ public sealed class GatewayOptionsTests
|
|||||||
|
|
||||||
[Theory]
|
[Theory]
|
||||||
[InlineData("MxGateway:Worker:ExecutablePath", "worker.dll", "MxGateway:Worker:ExecutablePath must point to a .exe file.")]
|
[InlineData("MxGateway:Worker:ExecutablePath", "worker.dll", "MxGateway:Worker:ExecutablePath must point to a .exe file.")]
|
||||||
|
[InlineData("MxGateway:Worker:StartupProbeRetryAttempts", "0", "MxGateway:Worker:StartupProbeRetryAttempts must be greater than zero.")]
|
||||||
|
[InlineData("MxGateway:Worker:PipeConnectAttemptTimeoutMilliseconds", "0", "MxGateway:Worker:PipeConnectAttemptTimeoutMilliseconds must be greater than zero.")]
|
||||||
[InlineData("MxGateway:Events:QueueCapacity", "0", "MxGateway:Events:QueueCapacity must be greater than zero.")]
|
[InlineData("MxGateway:Events:QueueCapacity", "0", "MxGateway:Events:QueueCapacity must be greater than zero.")]
|
||||||
[InlineData("MxGateway:Authentication:PepperSecretName", "", "MxGateway:Authentication:PepperSecretName is required")]
|
[InlineData("MxGateway:Authentication:PepperSecretName", "", "MxGateway:Authentication:PepperSecretName is required")]
|
||||||
[InlineData("MxGateway:Dashboard:PathBase", "dashboard", "MxGateway:Dashboard:PathBase must start with '/'.")]
|
[InlineData("MxGateway:Dashboard:PathBase", "dashboard", "MxGateway:Dashboard:PathBase must start with '/'.")]
|
||||||
|
|||||||
@@ -36,6 +36,10 @@ public sealed class WorkerProcessLauncherTests
|
|||||||
["--session-id", SessionId, "--pipe-name", PipeName, "--protocol-version", "1"],
|
["--session-id", SessionId, "--pipe-name", PipeName, "--protocol-version", "1"],
|
||||||
processFactory.LastStartInfo.ArgumentList);
|
processFactory.LastStartInfo.ArgumentList);
|
||||||
Assert.Equal(Nonce, processFactory.LastStartInfo.Environment[WorkerProcessLauncher.WorkerNonceEnvironmentVariableName]);
|
Assert.Equal(Nonce, processFactory.LastStartInfo.Environment[WorkerProcessLauncher.WorkerNonceEnvironmentVariableName]);
|
||||||
|
Assert.Equal(
|
||||||
|
"2000",
|
||||||
|
processFactory.LastStartInfo.Environment[
|
||||||
|
WorkerProcessLauncher.WorkerPipeConnectAttemptTimeoutEnvironmentVariableName]);
|
||||||
Assert.DoesNotContain(Nonce, handle.CommandLine.ToString(), StringComparison.Ordinal);
|
Assert.DoesNotContain(Nonce, handle.CommandLine.ToString(), StringComparison.Ordinal);
|
||||||
Assert.DoesNotContain(Nonce, string.Join(" ", handle.CommandLine.Arguments), StringComparison.Ordinal);
|
Assert.DoesNotContain(Nonce, string.Join(" ", handle.CommandLine.Arguments), StringComparison.Ordinal);
|
||||||
Assert.False(pipeReservation.DisposeCalled);
|
Assert.False(pipeReservation.DisposeCalled);
|
||||||
@@ -67,6 +71,32 @@ public sealed class WorkerProcessLauncherTests
|
|||||||
Assert.Equal(1, metrics.GetSnapshot().WorkerKills);
|
Assert.Equal(1, metrics.GetSnapshot().WorkerKills);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task LaunchAsync_WhenStartupProbeFailsTransiently_RetriesWithoutRespawningWorker()
|
||||||
|
{
|
||||||
|
using TestDirectory directory = TestDirectory.Create();
|
||||||
|
string executablePath = directory.CreateWorkerExecutable(machine: 0x014c);
|
||||||
|
FakeWorkerProcess process = new(processId: 1234);
|
||||||
|
FakeWorkerProcessFactory processFactory = new(process);
|
||||||
|
GatewayMetrics metrics = new();
|
||||||
|
WorkerProcessLauncher launcher = CreateLauncher(
|
||||||
|
executablePath,
|
||||||
|
processFactory,
|
||||||
|
new TransientStartupProbe(failuresBeforeSuccess: 1),
|
||||||
|
metrics,
|
||||||
|
startupProbeRetryAttempts: 2,
|
||||||
|
startupProbeRetryDelayMilliseconds: 1);
|
||||||
|
|
||||||
|
using WorkerProcessHandle handle = await launcher.LaunchAsync(CreateRequest());
|
||||||
|
|
||||||
|
Assert.Same(process, handle.Process);
|
||||||
|
Assert.Equal(1, processFactory.StartCount);
|
||||||
|
Assert.False(process.KillCalled);
|
||||||
|
GatewayMetricsSnapshot snapshot = metrics.GetSnapshot();
|
||||||
|
Assert.Equal(1, snapshot.RetryAttempts);
|
||||||
|
Assert.Equal(1, snapshot.RetryAttemptsByArea["worker_startup"]);
|
||||||
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public async Task LaunchAsync_WhenStartupTimesOut_KillsAndDisposesWorker()
|
public async Task LaunchAsync_WhenStartupTimesOut_KillsAndDisposesWorker()
|
||||||
{
|
{
|
||||||
@@ -152,7 +182,9 @@ public sealed class WorkerProcessLauncherTests
|
|||||||
IWorkerProcessFactory processFactory,
|
IWorkerProcessFactory processFactory,
|
||||||
IWorkerStartupProbe startupProbe,
|
IWorkerStartupProbe startupProbe,
|
||||||
GatewayMetrics? metrics = null,
|
GatewayMetrics? metrics = null,
|
||||||
int startupTimeoutSeconds = 30)
|
int startupTimeoutSeconds = 30,
|
||||||
|
int startupProbeRetryAttempts = 3,
|
||||||
|
int startupProbeRetryDelayMilliseconds = 250)
|
||||||
{
|
{
|
||||||
GatewayOptions options = new()
|
GatewayOptions options = new()
|
||||||
{
|
{
|
||||||
@@ -161,6 +193,8 @@ public sealed class WorkerProcessLauncherTests
|
|||||||
ExecutablePath = executablePath,
|
ExecutablePath = executablePath,
|
||||||
RequiredArchitecture = WorkerArchitecture.X86,
|
RequiredArchitecture = WorkerArchitecture.X86,
|
||||||
StartupTimeoutSeconds = startupTimeoutSeconds,
|
StartupTimeoutSeconds = startupTimeoutSeconds,
|
||||||
|
StartupProbeRetryAttempts = startupProbeRetryAttempts,
|
||||||
|
StartupProbeRetryDelayMilliseconds = startupProbeRetryDelayMilliseconds,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -185,8 +219,11 @@ public sealed class WorkerProcessLauncherTests
|
|||||||
{
|
{
|
||||||
public ProcessStartInfo? LastStartInfo { get; private set; }
|
public ProcessStartInfo? LastStartInfo { get; private set; }
|
||||||
|
|
||||||
|
public int StartCount { get; private set; }
|
||||||
|
|
||||||
public IWorkerProcess Start(ProcessStartInfo startInfo)
|
public IWorkerProcess Start(ProcessStartInfo startInfo)
|
||||||
{
|
{
|
||||||
|
StartCount++;
|
||||||
LastStartInfo = startInfo;
|
LastStartInfo = startInfo;
|
||||||
return process;
|
return process;
|
||||||
}
|
}
|
||||||
@@ -255,6 +292,24 @@ public sealed class WorkerProcessLauncherTests
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private sealed class TransientStartupProbe(int failuresBeforeSuccess) : IWorkerStartupProbe
|
||||||
|
{
|
||||||
|
private int _attempts;
|
||||||
|
|
||||||
|
public Task WaitUntilReadyAsync(
|
||||||
|
IWorkerProcess process,
|
||||||
|
WorkerProcessLaunchRequest request,
|
||||||
|
CancellationToken cancellationToken)
|
||||||
|
{
|
||||||
|
if (Interlocked.Increment(ref _attempts) <= failuresBeforeSuccess)
|
||||||
|
{
|
||||||
|
throw new IOException("The worker pipe was not ready yet.");
|
||||||
|
}
|
||||||
|
|
||||||
|
return Task.CompletedTask;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private sealed class FakePipeReservation : IDisposable
|
private sealed class FakePipeReservation : IDisposable
|
||||||
{
|
{
|
||||||
public bool DisposeCalled { get; private set; }
|
public bool DisposeCalled { get; private set; }
|
||||||
|
|||||||
@@ -80,6 +80,65 @@ public sealed class WorkerPipeClientTests
|
|||||||
await clientTask;
|
await clientTask;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task RunAsync_RetriesUntilPipeServerAppears()
|
||||||
|
{
|
||||||
|
string pipeName = $"mxaccess-gateway-test-{Guid.NewGuid():N}";
|
||||||
|
WorkerOptions workerOptions = new(
|
||||||
|
"session-1",
|
||||||
|
pipeName,
|
||||||
|
GatewayContractInfo.WorkerProtocolVersion,
|
||||||
|
"nonce-secret");
|
||||||
|
WorkerFrameProtocolOptions frameOptions = new(workerOptions);
|
||||||
|
|
||||||
|
WorkerPipeClient client = new(
|
||||||
|
logger: null,
|
||||||
|
connectTimeoutMilliseconds: 1000,
|
||||||
|
connectAttemptTimeoutMilliseconds: 50,
|
||||||
|
(stream, options, _) => CreateSession(stream, options));
|
||||||
|
Task clientTask = client.RunAsync(workerOptions);
|
||||||
|
|
||||||
|
await Task.Delay(150);
|
||||||
|
|
||||||
|
using NamedPipeServerStream server = new(
|
||||||
|
pipeName,
|
||||||
|
PipeDirection.InOut,
|
||||||
|
1,
|
||||||
|
PipeTransmissionMode.Byte,
|
||||||
|
PipeOptions.Asynchronous);
|
||||||
|
|
||||||
|
await Task.Factory.FromAsync(server.BeginWaitForConnection, server.EndWaitForConnection, null);
|
||||||
|
|
||||||
|
WorkerFrameReader reader = new(server, frameOptions);
|
||||||
|
WorkerFrameWriter writer = new(server, frameOptions);
|
||||||
|
|
||||||
|
await writer.WriteAsync(CreateGatewayHello());
|
||||||
|
Assert.Equal(WorkerEnvelope.BodyOneofCase.WorkerHello, (await reader.ReadAsync()).BodyCase);
|
||||||
|
Assert.Equal(WorkerEnvelope.BodyOneofCase.WorkerReady, (await reader.ReadAsync()).BodyCase);
|
||||||
|
await writer.WriteAsync(CreateShutdown());
|
||||||
|
|
||||||
|
Assert.Equal(WorkerEnvelope.BodyOneofCase.WorkerShutdownAck, (await reader.ReadAsync()).BodyCase);
|
||||||
|
await clientTask;
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task RunAsync_WhenPipeNeverAppears_ThrowsTimeoutException()
|
||||||
|
{
|
||||||
|
WorkerOptions workerOptions = new(
|
||||||
|
"session-1",
|
||||||
|
$"mxaccess-gateway-test-{Guid.NewGuid():N}",
|
||||||
|
GatewayContractInfo.WorkerProtocolVersion,
|
||||||
|
"nonce-secret");
|
||||||
|
|
||||||
|
WorkerPipeClient client = new(
|
||||||
|
logger: null,
|
||||||
|
connectTimeoutMilliseconds: 100,
|
||||||
|
connectAttemptTimeoutMilliseconds: 50,
|
||||||
|
(stream, options, _) => CreateSession(stream, options));
|
||||||
|
|
||||||
|
await Assert.ThrowsAsync<TimeoutException>(async () => await client.RunAsync(workerOptions));
|
||||||
|
}
|
||||||
|
|
||||||
private static WorkerPipeSession CreateSession(
|
private static WorkerPipeSession CreateSession(
|
||||||
Stream stream,
|
Stream stream,
|
||||||
WorkerFrameProtocolOptions options)
|
WorkerFrameProtocolOptions options)
|
||||||
@@ -97,6 +156,37 @@ public sealed class WorkerPipeClientTests
|
|||||||
() => new FakeRuntimeSession());
|
() => new FakeRuntimeSession());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static WorkerEnvelope CreateGatewayHello()
|
||||||
|
{
|
||||||
|
return new WorkerEnvelope
|
||||||
|
{
|
||||||
|
ProtocolVersion = GatewayContractInfo.WorkerProtocolVersion,
|
||||||
|
SessionId = "session-1",
|
||||||
|
Sequence = 1,
|
||||||
|
GatewayHello = new GatewayHello
|
||||||
|
{
|
||||||
|
SupportedProtocolVersion = GatewayContractInfo.WorkerProtocolVersion,
|
||||||
|
Nonce = "nonce-secret",
|
||||||
|
GatewayVersion = "test-gateway",
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private static WorkerEnvelope CreateShutdown()
|
||||||
|
{
|
||||||
|
return new WorkerEnvelope
|
||||||
|
{
|
||||||
|
ProtocolVersion = GatewayContractInfo.WorkerProtocolVersion,
|
||||||
|
SessionId = "session-1",
|
||||||
|
Sequence = 2,
|
||||||
|
WorkerShutdown = new WorkerShutdown
|
||||||
|
{
|
||||||
|
GracePeriod = Duration.FromTimeSpan(TimeSpan.FromSeconds(1)),
|
||||||
|
Reason = "test-complete",
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
private sealed class FakeRuntimeSession : IWorkerRuntimeSession
|
private sealed class FakeRuntimeSession : IWorkerRuntimeSession
|
||||||
{
|
{
|
||||||
public Task<WorkerReady> StartAsync(
|
public Task<WorkerReady> StartAsync(
|
||||||
|
|||||||
@@ -1,17 +1,24 @@
|
|||||||
using System;
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
using System.IO;
|
using System.IO;
|
||||||
using System.IO.Pipes;
|
using System.IO.Pipes;
|
||||||
using System.Threading;
|
using System.Threading;
|
||||||
using System.Threading.Tasks;
|
using System.Threading.Tasks;
|
||||||
using MxGateway.Worker.Bootstrap;
|
using MxGateway.Worker.Bootstrap;
|
||||||
|
using Polly;
|
||||||
|
using Polly.Retry;
|
||||||
|
|
||||||
namespace MxGateway.Worker.Ipc;
|
namespace MxGateway.Worker.Ipc;
|
||||||
|
|
||||||
public sealed class WorkerPipeClient : IWorkerPipeClient
|
public sealed class WorkerPipeClient : IWorkerPipeClient
|
||||||
{
|
{
|
||||||
public const int DefaultConnectTimeoutMilliseconds = 30000;
|
public const int DefaultConnectTimeoutMilliseconds = 30000;
|
||||||
|
public const int DefaultConnectAttemptTimeoutMilliseconds = 2000;
|
||||||
|
public const string ConnectAttemptTimeoutEnvironmentVariableName =
|
||||||
|
"MXGATEWAY_WORKER_PIPE_CONNECT_ATTEMPT_TIMEOUT_MS";
|
||||||
|
|
||||||
private readonly int _connectTimeoutMilliseconds;
|
private readonly int _connectTimeoutMilliseconds;
|
||||||
|
private readonly int _connectAttemptTimeoutMilliseconds;
|
||||||
private readonly Func<Stream, WorkerFrameProtocolOptions, IWorkerLogger?, WorkerPipeSession> _sessionFactory;
|
private readonly Func<Stream, WorkerFrameProtocolOptions, IWorkerLogger?, WorkerPipeSession> _sessionFactory;
|
||||||
private readonly IWorkerLogger? _logger;
|
private readonly IWorkerLogger? _logger;
|
||||||
|
|
||||||
@@ -36,6 +43,7 @@ public sealed class WorkerPipeClient : IWorkerPipeClient
|
|||||||
: this(
|
: this(
|
||||||
null,
|
null,
|
||||||
connectTimeoutMilliseconds,
|
connectTimeoutMilliseconds,
|
||||||
|
ResolveDefaultConnectAttemptTimeoutMilliseconds(),
|
||||||
(stream, frameOptions, _) => sessionFactory(stream, frameOptions))
|
(stream, frameOptions, _) => sessionFactory(stream, frameOptions))
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
@@ -46,6 +54,7 @@ public sealed class WorkerPipeClient : IWorkerPipeClient
|
|||||||
: this(
|
: this(
|
||||||
logger,
|
logger,
|
||||||
connectTimeoutMilliseconds,
|
connectTimeoutMilliseconds,
|
||||||
|
ResolveDefaultConnectAttemptTimeoutMilliseconds(),
|
||||||
(stream, frameOptions, workerLogger) => new WorkerPipeSession(stream, frameOptions, workerLogger))
|
(stream, frameOptions, workerLogger) => new WorkerPipeSession(stream, frameOptions, workerLogger))
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
@@ -54,6 +63,19 @@ public sealed class WorkerPipeClient : IWorkerPipeClient
|
|||||||
IWorkerLogger? logger,
|
IWorkerLogger? logger,
|
||||||
int connectTimeoutMilliseconds,
|
int connectTimeoutMilliseconds,
|
||||||
Func<Stream, WorkerFrameProtocolOptions, IWorkerLogger?, WorkerPipeSession> sessionFactory)
|
Func<Stream, WorkerFrameProtocolOptions, IWorkerLogger?, WorkerPipeSession> sessionFactory)
|
||||||
|
: this(
|
||||||
|
logger,
|
||||||
|
connectTimeoutMilliseconds,
|
||||||
|
ResolveDefaultConnectAttemptTimeoutMilliseconds(),
|
||||||
|
sessionFactory)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
public WorkerPipeClient(
|
||||||
|
IWorkerLogger? logger,
|
||||||
|
int connectTimeoutMilliseconds,
|
||||||
|
int connectAttemptTimeoutMilliseconds,
|
||||||
|
Func<Stream, WorkerFrameProtocolOptions, IWorkerLogger?, WorkerPipeSession> sessionFactory)
|
||||||
{
|
{
|
||||||
if (connectTimeoutMilliseconds <= 0)
|
if (connectTimeoutMilliseconds <= 0)
|
||||||
{
|
{
|
||||||
@@ -62,9 +84,17 @@ public sealed class WorkerPipeClient : IWorkerPipeClient
|
|||||||
"Worker pipe connect timeout must be greater than zero.");
|
"Worker pipe connect timeout must be greater than zero.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (connectAttemptTimeoutMilliseconds <= 0)
|
||||||
|
{
|
||||||
|
throw new ArgumentOutOfRangeException(
|
||||||
|
nameof(connectAttemptTimeoutMilliseconds),
|
||||||
|
"Worker pipe connect attempt timeout must be greater than zero.");
|
||||||
|
}
|
||||||
|
|
||||||
_logger = logger;
|
_logger = logger;
|
||||||
_sessionFactory = sessionFactory ?? throw new ArgumentNullException(nameof(sessionFactory));
|
_sessionFactory = sessionFactory ?? throw new ArgumentNullException(nameof(sessionFactory));
|
||||||
_connectTimeoutMilliseconds = connectTimeoutMilliseconds;
|
_connectTimeoutMilliseconds = connectTimeoutMilliseconds;
|
||||||
|
_connectAttemptTimeoutMilliseconds = connectAttemptTimeoutMilliseconds;
|
||||||
}
|
}
|
||||||
|
|
||||||
public async Task RunAsync(
|
public async Task RunAsync(
|
||||||
@@ -78,28 +108,91 @@ public sealed class WorkerPipeClient : IWorkerPipeClient
|
|||||||
|
|
||||||
WorkerFrameProtocolOptions frameOptions = new(options);
|
WorkerFrameProtocolOptions frameOptions = new(options);
|
||||||
|
|
||||||
using NamedPipeClientStream pipe = new(
|
using NamedPipeClientStream pipe = await ConnectWithRetryAsync(options.PipeName, cancellationToken)
|
||||||
".",
|
.ConfigureAwait(false);
|
||||||
options.PipeName,
|
|
||||||
PipeDirection.InOut,
|
|
||||||
PipeOptions.Asynchronous);
|
|
||||||
|
|
||||||
await ConnectAsync(pipe, cancellationToken).ConfigureAwait(false);
|
|
||||||
|
|
||||||
WorkerPipeSession session = _sessionFactory(pipe, frameOptions, _logger);
|
WorkerPipeSession session = _sessionFactory(pipe, frameOptions, _logger);
|
||||||
await session.RunAsync(cancellationToken).ConfigureAwait(false);
|
await session.RunAsync(cancellationToken).ConfigureAwait(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Task ConnectAsync(
|
private async Task<NamedPipeClientStream> ConnectWithRetryAsync(
|
||||||
NamedPipeClientStream pipe,
|
string pipeName,
|
||||||
CancellationToken cancellationToken)
|
CancellationToken cancellationToken)
|
||||||
{
|
{
|
||||||
return Task.Run(
|
int retryAttempts = Math.Max(
|
||||||
() =>
|
0,
|
||||||
|
(_connectTimeoutMilliseconds / Math.Min(_connectTimeoutMilliseconds, _connectAttemptTimeoutMilliseconds)) - 1);
|
||||||
|
|
||||||
|
ResiliencePipeline<NamedPipeClientStream> pipeline = new ResiliencePipelineBuilder<NamedPipeClientStream>()
|
||||||
|
.AddRetry(new RetryStrategyOptions<NamedPipeClientStream>
|
||||||
{
|
{
|
||||||
cancellationToken.ThrowIfCancellationRequested();
|
MaxRetryAttempts = retryAttempts,
|
||||||
pipe.Connect(_connectTimeoutMilliseconds);
|
BackoffType = DelayBackoffType.Exponential,
|
||||||
},
|
UseJitter = true,
|
||||||
cancellationToken);
|
Delay = TimeSpan.FromMilliseconds(250),
|
||||||
|
MaxDelay = TimeSpan.FromSeconds(2),
|
||||||
|
ShouldHandle = new PredicateBuilder<NamedPipeClientStream>()
|
||||||
|
.Handle<Exception>(exception => exception is TimeoutException or IOException),
|
||||||
|
OnRetry = args =>
|
||||||
|
{
|
||||||
|
args.Outcome.Result?.Dispose();
|
||||||
|
_logger?.Information(
|
||||||
|
"WorkerPipeConnectRetry",
|
||||||
|
new Dictionary<string, object?>
|
||||||
|
{
|
||||||
|
["attempt"] = args.AttemptNumber + 1,
|
||||||
|
["pipe_name"] = pipeName,
|
||||||
|
});
|
||||||
|
return default;
|
||||||
|
},
|
||||||
|
})
|
||||||
|
.Build();
|
||||||
|
|
||||||
|
return await pipeline.ExecuteAsync(
|
||||||
|
async token => await ConnectSingleAttemptAsync(pipeName, token).ConfigureAwait(false),
|
||||||
|
cancellationToken)
|
||||||
|
.ConfigureAwait(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
private async Task<NamedPipeClientStream> ConnectSingleAttemptAsync(
|
||||||
|
string pipeName,
|
||||||
|
CancellationToken cancellationToken)
|
||||||
|
{
|
||||||
|
NamedPipeClientStream pipe = new(
|
||||||
|
".",
|
||||||
|
pipeName,
|
||||||
|
PipeDirection.InOut,
|
||||||
|
PipeOptions.Asynchronous);
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
using CancellationTokenSource attemptTimeout =
|
||||||
|
CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
|
||||||
|
attemptTimeout.CancelAfter(_connectAttemptTimeoutMilliseconds);
|
||||||
|
|
||||||
|
await Task.Run(
|
||||||
|
() =>
|
||||||
|
{
|
||||||
|
attemptTimeout.Token.ThrowIfCancellationRequested();
|
||||||
|
pipe.Connect(_connectAttemptTimeoutMilliseconds);
|
||||||
|
},
|
||||||
|
attemptTimeout.Token)
|
||||||
|
.ConfigureAwait(false);
|
||||||
|
|
||||||
|
return pipe;
|
||||||
|
}
|
||||||
|
catch
|
||||||
|
{
|
||||||
|
pipe.Dispose();
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int ResolveDefaultConnectAttemptTimeoutMilliseconds()
|
||||||
|
{
|
||||||
|
string? configuredValue = Environment.GetEnvironmentVariable(ConnectAttemptTimeoutEnvironmentVariableName);
|
||||||
|
return int.TryParse(configuredValue, out int milliseconds) && milliseconds > 0
|
||||||
|
? milliseconds
|
||||||
|
: DefaultConnectAttemptTimeoutMilliseconds;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,6 +10,10 @@
|
|||||||
<GenerateBindingRedirectsOutputType>true</GenerateBindingRedirectsOutputType>
|
<GenerateBindingRedirectsOutputType>true</GenerateBindingRedirectsOutputType>
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<PackageReference Include="Polly.Core" Version="8.6.6" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ProjectReference Include="..\MxGateway.Contracts\MxGateway.Contracts.csproj" />
|
<ProjectReference Include="..\MxGateway.Contracts\MxGateway.Contracts.csproj" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|||||||
Reference in New Issue
Block a user