using System.Collections.Concurrent;
using Polly;
using Polly.CircuitBreaker;
using Polly.Retry;
using Polly.Timeout;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;

namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;

/// <summary>
/// Builds and caches Polly resilience pipelines keyed on
/// <c>(DriverInstanceId, HostName, DriverCapability)</c>. One dead PLC behind a multi-device
/// driver cannot open the circuit breaker for healthy sibling hosts.
/// </summary>
/// <remarks>
/// <para>Per <c>docs/v2/plan.md</c> decision #144 (per-device isolation). Composition from outside-in,
/// as built by this class: Timeout → Retry (when the resolved policy has a positive retry count) →
/// Circuit Breaker (when the resolved policy has a positive breaker threshold). No bulkhead /
/// concurrency-limiter strategy is added by this builder.</para>
///
/// <para>Pipeline resolution is lock-free on the hot path: the inner
/// <see cref="ConcurrentDictionary{TKey, TValue}"/> caches a <see cref="ResiliencePipeline"/> per key;
/// first-call cost is one <see cref="ResiliencePipelineBuilder"/>.Build. Thereafter reads are O(1).</para>
/// </remarks>
public sealed class DriverResiliencePipelineBuilder
{
    /// <summary>Smallest <c>MinimumThroughput</c> the Polly circuit breaker accepts at validation time.</summary>
    private const int MinimumBreakerThroughput = 2;

    /// <summary>Base delay of the exponential retry backoff.</summary>
    private static readonly TimeSpan RetryBaseDelay = TimeSpan.FromMilliseconds(100);

    /// <summary>Upper bound on a single retry delay.</summary>
    private static readonly TimeSpan RetryMaxDelay = TimeSpan.FromSeconds(5);

    /// <summary>Window over which the breaker samples call outcomes.</summary>
    private static readonly TimeSpan BreakerSamplingDuration = TimeSpan.FromSeconds(30);

    /// <summary>How long the breaker stays open once tripped.</summary>
    private static readonly TimeSpan BreakerBreakDuration = TimeSpan.FromSeconds(15);

    private readonly ConcurrentDictionary<PipelineKey, ResiliencePipeline> _pipelines = new();
    private readonly TimeProvider _timeProvider;
    private readonly DriverResilienceStatusTracker? _statusTracker;

    /// <summary>Construct with the ambient clock (use in prod).</summary>
    /// <param name="timeProvider">Clock source for pipeline timeouts + breaker sampling. Defaults to system.</param>
    /// <param name="statusTracker">When non-null, every built pipeline wires Polly telemetry into
    /// the tracker — retries increment ConsecutiveFailures, breaker-open stamps
    /// LastBreakerOpenUtc, breaker-close resets failures. Feeds Admin <c>/hosts</c> +
    /// the Polly bulkhead-depth column. Absent tracker means no telemetry (unit tests +
    /// deployments that don't care about resilience observability).</param>
    public DriverResiliencePipelineBuilder(
        TimeProvider? timeProvider = null,
        DriverResilienceStatusTracker? statusTracker = null)
    {
        _timeProvider = timeProvider ?? TimeProvider.System;
        _statusTracker = statusTracker;
    }

    /// <summary>
    /// Get or build the pipeline for a given <c>(driver instance, host, capability)</c> triple.
    /// The cache key does not include <paramref name="options"/>: once a pipeline is cached for a
    /// key it is returned as-is until <see cref="Invalidate"/> removes it. If two callers race on a
    /// cold key the factory may run more than once, but only one pipeline is stored and returned.
    /// </summary>
    /// <param name="driverInstanceId">DriverInstance primary key — opaque to this layer.</param>
    /// <param name="hostName">
    /// Host the call targets. For single-host drivers (Galaxy, some OPC UA Client configs) pass the
    /// driver's canonical host string. For multi-host drivers (Modbus with N PLCs), pass the
    /// specific PLC so one dead PLC doesn't poison healthy siblings.
    /// </param>
    /// <param name="capability">Which capability surface is being called.</param>
    /// <param name="options">Per-driver-instance options (tier + per-capability overrides).</param>
    /// <returns>The cached or newly built pipeline for the key.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="options"/>, <paramref name="driverInstanceId"/> or <paramref name="hostName"/> is null.
    /// </exception>
    /// <exception cref="ArgumentException"><paramref name="hostName"/> is empty or whitespace.</exception>
    public ResiliencePipeline GetOrCreate(
        string driverInstanceId,
        string hostName,
        DriverCapability capability,
        DriverResilienceOptions options)
    {
        ArgumentNullException.ThrowIfNull(options);
        // Fail fast instead of letting a null id become a cache key and flow into the tracker.
        ArgumentNullException.ThrowIfNull(driverInstanceId);
        ArgumentException.ThrowIfNullOrWhiteSpace(hostName);

        var key = new PipelineKey(driverInstanceId, hostName, capability);

        // Static lambda + explicit state argument: no closure allocation on the hot (cache-hit) path.
        return _pipelines.GetOrAdd(
            key,
            static (k, state) => Build(
                k.DriverInstanceId,
                k.HostName,
                k.Capability,
                state.options,
                state.timeProvider,
                state.tracker),
            (options, timeProvider: _timeProvider, tracker: _statusTracker));
    }

    /// <summary>Drop cached pipelines for one driver instance (e.g. on ResilienceConfig change). Test + Admin-reload use.</summary>
    /// <param name="driverInstanceId">Driver instance whose cached pipelines (all hosts, all capabilities) are removed.</param>
    /// <returns>Number of cache entries this call actually removed.</returns>
    public int Invalidate(string driverInstanceId)
    {
        var removed = 0;

        foreach (var key in _pipelines.Keys)
        {
            if (!string.Equals(key.DriverInstanceId, driverInstanceId, StringComparison.Ordinal))
            {
                continue;
            }

            // TryRemove can lose a race with a concurrent Invalidate; only count our own removals.
            if (_pipelines.TryRemove(key, out _))
            {
                removed++;
            }
        }

        return removed;
    }

    /// <summary>Snapshot of the current number of cached pipelines. For diagnostics only.</summary>
    public int CachedPipelineCount => _pipelines.Count;

    /// <summary>
    /// Compose a pipeline for one key: Timeout (always), then Retry and Circuit Breaker when the
    /// policy resolved from <paramref name="options"/> enables them.
    /// </summary>
    /// <param name="driverInstanceId">Driver instance id reported to the tracker.</param>
    /// <param name="hostName">Host name reported to the tracker.</param>
    /// <param name="capability">Capability used to resolve the effective policy.</param>
    /// <param name="options">Source of the per-capability policy.</param>
    /// <param name="timeProvider">Clock given to Polly and used to timestamp tracker events.</param>
    /// <param name="tracker">Optional telemetry sink; when null no callbacks are wired.</param>
    /// <returns>The built pipeline.</returns>
    private static ResiliencePipeline Build(
        string driverInstanceId,
        string hostName,
        DriverCapability capability,
        DriverResilienceOptions options,
        TimeProvider timeProvider,
        DriverResilienceStatusTracker? tracker)
    {
        var policy = options.Resolve(capability);
        var builder = new ResiliencePipelineBuilder { TimeProvider = timeProvider };

        // Outermost strategy: the timeout spans the whole call, including any retries below.
        builder.AddTimeout(new TimeoutStrategyOptions
        {
            Timeout = TimeSpan.FromSeconds(policy.TimeoutSeconds),
        });

        if (policy.RetryCount > 0)
        {
            var retryOptions = new RetryStrategyOptions
            {
                MaxRetryAttempts = policy.RetryCount,
                BackoffType = DelayBackoffType.Exponential,
                UseJitter = true,
                Delay = RetryBaseDelay,
                MaxDelay = RetryMaxDelay,
                // Cancellation is never retried; every other exception is.
                ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
            };

            if (tracker is not null)
            {
                retryOptions.OnRetry = _ =>
                {
                    tracker.RecordFailure(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
                    return default;
                };
            }

            builder.AddRetry(retryOptions);
        }

        if (policy.BreakerFailureThreshold > 0)
        {
            var breakerOptions = new CircuitBreakerStrategyOptions
            {
                FailureRatio = 1.0,
                // Polly rejects MinimumThroughput < 2 at validation time, so a configured threshold
                // of 1 would make every GetOrCreate for this key throw. Clamp to the smallest legal value.
                MinimumThroughput = Math.Max(MinimumBreakerThroughput, policy.BreakerFailureThreshold),
                SamplingDuration = BreakerSamplingDuration,
                BreakDuration = BreakerBreakDuration,
                // Cancellation does not count as a failure towards opening the breaker.
                ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
            };

            if (tracker is not null)
            {
                breakerOptions.OnOpened = _ =>
                {
                    tracker.RecordBreakerOpen(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
                    return default;
                };

                breakerOptions.OnClosed = _ =>
                {
                    // Closing the breaker means the target recovered — reset the consecutive-
                    // failure counter so Admin UI stops flashing red for this host.
                    tracker.RecordSuccess(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
                    return default;
                };
            }

            builder.AddCircuitBreaker(breakerOptions);
        }

        return builder.Build();
    }

    /// <summary>Cache key: one pipeline per (driver instance, host, capability).</summary>
    private readonly record struct PipelineKey(string DriverInstanceId, string HostName, DriverCapability Capability);
}