using Grpc.Core;
using Microsoft.Extensions.Logging;
using MxGateway.Contracts.Proto;
using Polly;
using Polly.Retry;

namespace MxGateway.Client;

/// <summary>
/// Factory and helpers for exponential-backoff retry policies on transient gRPC failures.
/// </summary>
internal static class MxGatewayClientRetryPolicy
{
    /// <summary>
    /// Creates a Polly <see cref="ResiliencePipeline"/> that retries transient gRPC failures
    /// with exponential backoff.
    /// </summary>
    /// <param name="options">Retry configuration (max attempts, delay bounds, jitter).</param>
    /// <param name="logger">Optional logger for retry diagnostics.</param>
    /// <returns>
    /// A pipeline that retries up to <c>options.MaxAttempts - 1</c> times, or
    /// <see cref="ResiliencePipeline.Empty"/> when the configuration allows no retries.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is <see langword="null"/>.</exception>
    public static ResiliencePipeline Create(
        MxGatewayClientRetryOptions options,
        ILogger? logger)
    {
        ArgumentNullException.ThrowIfNull(options);
        options.Validate();

        // MaxAttempts counts the initial call; Polly counts only the retries after it.
        var maxRetryAttempts = Math.Max(0, options.MaxAttempts - 1);
        if (maxRetryAttempts == 0)
        {
            // Polly validates RetryStrategyOptions.MaxRetryAttempts with a minimum of 1,
            // so a single-attempt configuration must bypass the retry strategy instead
            // of being passed through as zero retries.
            return ResiliencePipeline.Empty;
        }

        return new ResiliencePipelineBuilder()
            .AddRetry(new RetryStrategyOptions
            {
                MaxRetryAttempts = maxRetryAttempts,
                BackoffType = DelayBackoffType.Exponential,
                UseJitter = options.UseJitter,
                Delay = options.Delay,
                MaxDelay = options.MaxDelay,

                // The type argument must be explicit: TException cannot be inferred
                // from a method group.
                ShouldHandle = new PredicateBuilder().Handle<Exception>(IsTransientGrpcFailure),
                OnRetry = args =>
                {
                    logger?.LogDebug(
                        args.Outcome.Exception,
                        "Retrying MXAccess Gateway client call after transient gRPC failure. Attempt {Attempt}.",
                        args.AttemptNumber + 1);

                    // Nothing asynchronous to await; return a completed ValueTask.
                    return default;
                },
            })
            .Build();
    }

    /// <summary>
    /// Returns whether a command kind is eligible for automatic retry on transient failures.
    /// </summary>
    /// <param name="kind">The command kind to check.</param>
    /// <returns>
    /// <see langword="true"/> for <c>Ping</c>, <c>GetSessionState</c> and <c>GetWorkerInfo</c>;
    /// otherwise <see langword="false"/>.
    /// </returns>
    public static bool IsRetryableCommand(MxCommandKind kind)
    {
        return kind is MxCommandKind.Ping
            or MxCommandKind.GetSessionState
            or MxCommandKind.GetWorkerInfo;
    }

    /// <summary>
    /// Returns whether an exception is a gRPC failure with a transient status code, either
    /// directly or as the inner exception of an <see cref="MxGatewayException"/>.
    /// </summary>
    /// <param name="exception">The exception raised by the attempted call.</param>
    private static bool IsTransientGrpcFailure(Exception exception)
    {
        return exception switch
        {
            RpcException rpcException => IsTransientStatus(rpcException.StatusCode),
            MxGatewayException { InnerException: RpcException rpcException } => IsTransientStatus(rpcException.StatusCode),
            _ => false,
        };
    }

    /// <summary>
    /// Returns whether a gRPC status code is treated as transient (worth retrying).
    /// </summary>
    /// <param name="statusCode">The status code reported by the failed call.</param>
    private static bool IsTransientStatus(StatusCode statusCode)
    {
        // DeadlineExceeded is intentionally NOT treated as transient. The deadline
        // on every unary call is client-imposed (CreateCallOptions stamps the
        // DefaultCallTimeout budget), and that same budget is shared across the
        // initial attempt plus all retries plus backoff. A DeadlineExceeded means
        // the shared budget is exhausted, so an immediate retry would only fail
        // again — burning the remaining budget on a call that cannot succeed.
        return statusCode is StatusCode.Unavailable or StatusCode.ResourceExhausted;
    }
}