using Grpc.Core;
using Microsoft.Extensions.Logging;
using MxGateway.Contracts.Proto;
using Polly;
using Polly.Retry;
namespace MxGateway.Client;
/// <summary>
/// Factory and helpers for exponential-backoff retry policies on transient gRPC failures.
/// </summary>
internal static class MxGatewayClientRetryPolicy
{
    /// <summary>
    /// Creates a Polly <see cref="ResiliencePipeline"/> that retries transient gRPC failures
    /// with exponential backoff.
    /// </summary>
    /// <param name="options">
    /// Retry configuration (max attempts, delay bounds, jitter). It is validated via
    /// <c>options.Validate()</c> before any value is read.
    /// </param>
    /// <param name="logger">
    /// Optional logger for retry diagnostics. When supplied, every retry is logged at debug
    /// level together with the exception that triggered it.
    /// </param>
    /// <returns>
    /// A pipeline that performs up to <c>options.MaxAttempts - 1</c> retries, or
    /// <see cref="ResiliencePipeline.Empty"/> (a pass-through pipeline) when the configured
    /// attempt count leaves no room for a retry.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="options"/> is <see langword="null"/>.
    /// </exception>
    public static ResiliencePipeline Create(
        MxGatewayClientRetryOptions options,
        ILogger? logger)
    {
        ArgumentNullException.ThrowIfNull(options);
        options.Validate();

        // MaxAttempts counts the initial call, so a value of 1 (or less) means "never retry".
        // Polly validates RetryStrategyOptions.MaxRetryAttempts against a minimum of 1 when the
        // strategy is added, so a zero-retry budget cannot be expressed as a retry strategy;
        // hand back the pass-through pipeline instead of letting AddRetry throw.
        // Comparing before subtracting also avoids underflow for extreme negative values.
        if (options.MaxAttempts <= 1)
        {
            return ResiliencePipeline.Empty;
        }

        var retryOptions = new RetryStrategyOptions
        {
            MaxRetryAttempts = options.MaxAttempts - 1,
            BackoffType = DelayBackoffType.Exponential,
            UseJitter = options.UseJitter,
            Delay = options.Delay,
            MaxDelay = options.MaxDelay,
            ShouldHandle = new PredicateBuilder().Handle(IsTransientGrpcFailure),
            OnRetry = args =>
            {
                // Polly reports a zero-based attempt number; log it one-based.
                logger?.LogDebug(
                    args.Outcome.Exception,
                    "Retrying MXAccess Gateway client call after transient gRPC failure. Attempt {Attempt}.",
                    args.AttemptNumber + 1);

                // Nothing asynchronous to await: return an already-completed ValueTask.
                return default;
            },
        };

        return new ResiliencePipelineBuilder()
            .AddRetry(retryOptions)
            .Build();
    }

    /// <summary>
    /// Returns whether a command kind is eligible for automatic retry on transient failures.
    /// </summary>
    /// <param name="kind">The command kind to check.</param>
    /// <returns>
    /// <see langword="true"/> for <c>Ping</c>, <c>GetSessionState</c> and <c>GetWorkerInfo</c>;
    /// <see langword="false"/> for every other command kind.
    /// </returns>
    public static bool IsRetryableCommand(MxCommandKind kind) =>
        kind is MxCommandKind.Ping
            or MxCommandKind.GetSessionState
            or MxCommandKind.GetWorkerInfo;

    /// <summary>
    /// Decides whether an exception represents a transient gRPC failure worth retrying.
    /// </summary>
    /// <param name="exception">The exception raised by the attempted call.</param>
    /// <returns>
    /// <see langword="true"/> when the exception is an <see cref="RpcException"/>, or an
    /// <c>MxGatewayException</c> whose inner exception is an <see cref="RpcException"/>,
    /// carrying a transient status code; otherwise <see langword="false"/>.
    /// </returns>
    private static bool IsTransientGrpcFailure(Exception exception) =>
        exception switch
        {
            RpcException rpcException => IsTransientStatus(rpcException.StatusCode),
            MxGatewayException { InnerException: RpcException rpcException } => IsTransientStatus(rpcException.StatusCode),
            _ => false,
        };

    /// <summary>
    /// Returns whether a gRPC status code is treated as transient.
    /// </summary>
    /// <param name="statusCode">The status code reported by the failed call.</param>
    /// <returns>
    /// <see langword="true"/> for <see cref="StatusCode.Unavailable"/> and
    /// <see cref="StatusCode.ResourceExhausted"/>; otherwise <see langword="false"/>.
    /// </returns>
    private static bool IsTransientStatus(StatusCode statusCode) =>
        // DeadlineExceeded is intentionally NOT treated as transient. The deadline
        // on every unary call is client-imposed (CreateCallOptions stamps the
        // DefaultCallTimeout budget), and that same budget is shared across the
        // initial attempt plus all retries plus backoff. A DeadlineExceeded means
        // the shared budget is exhausted, so an immediate retry would only fail
        // again — burning the remaining budget on a call that cannot succeed.
        statusCode is StatusCode.Unavailable
            or StatusCode.ResourceExhausted;
}