using System.Collections.Concurrent;
using Polly;
using Polly.CircuitBreaker;
using Polly.Retry;
using Polly.Timeout;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;

namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;

/// <summary>
/// Builds and caches Polly resilience pipelines keyed on
/// <c>(DriverInstanceId, HostName, DriverCapability)</c>. One dead PLC behind a multi-device
/// driver cannot open the circuit breaker for healthy sibling hosts.
/// </summary>
/// <remarks>
/// <para>Per <c>docs/v2/plan.md</c> decision #144 (per-device isolation). Composition from outside-in:
/// Timeout → Retry (when the resolved policy has a positive retry count) → Circuit Breaker
/// (when the resolved policy has a positive breaker threshold). Because the timeout is added
/// first it is the outermost strategy. The plan also names a Bulkhead stage; this class does
/// not add one.</para>
///
/// <para>Pipeline resolution goes through a <see cref="ConcurrentDictionary{TKey,TValue}"/> that
/// caches one <see cref="ResiliencePipeline"/> per key; the first call for a key pays for one
/// <see cref="ResiliencePipelineBuilder"/> build, later calls are dictionary lookups.</para>
/// </remarks>
public sealed class DriverResiliencePipelineBuilder
{
    /// <summary>
    /// Smallest <c>MinimumThroughput</c> value accepted by Polly's circuit-breaker options
    /// validation. Anything lower makes the pipeline build throw.
    /// </summary>
    private const int MinimumValidBreakerThroughput = 2;

    private readonly ConcurrentDictionary<PipelineKey, ResiliencePipeline> _pipelines = new();
    private readonly TimeProvider _timeProvider;

    /// <summary>Construct with the ambient clock (use in prod) or an injected one (tests).</summary>
    /// <param name="timeProvider">
    /// Clock handed to every pipeline built by this instance; <see cref="TimeProvider.System"/>
    /// when <see langword="null"/>.
    /// </param>
    public DriverResiliencePipelineBuilder(TimeProvider? timeProvider = null)
    {
        _timeProvider = timeProvider ?? TimeProvider.System;
    }

    /// <summary>
    /// Get or build the pipeline for a given (driver instance, host, capability) triple.
    /// Calls with the same key reuse the same cached pipeline instance; if two callers race on
    /// a new key the dictionary keeps one pipeline and both callers receive it.
    /// </summary>
    /// <remarks>
    /// <paramref name="options"/> is NOT part of the cache key. It is only consulted when the
    /// key is first built, so changed options take effect only after
    /// <see cref="Invalidate"/> has dropped the cached entries for the driver instance.
    /// </remarks>
    /// <param name="driverInstanceId">DriverInstance primary key — opaque to this layer.</param>
    /// <param name="hostName">
    /// Host the call targets. For single-host drivers (Galaxy, some OPC UA Client configs) pass the
    /// driver's canonical host string. For multi-host drivers (Modbus with N PLCs), pass the
    /// specific PLC so one dead PLC doesn't poison healthy siblings.
    /// </param>
    /// <param name="capability">Which capability surface is being called.</param>
    /// <param name="options">Per-driver-instance options (tier + per-capability overrides).</param>
    /// <returns>The cached (or newly built) pipeline for the key.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="options"/> or <paramref name="hostName"/> is <see langword="null"/>.
    /// </exception>
    /// <exception cref="ArgumentException"><paramref name="hostName"/> is empty or whitespace.</exception>
    public ResiliencePipeline GetOrCreate(
        string driverInstanceId,
        string hostName,
        DriverCapability capability,
        DriverResilienceOptions options)
    {
        ArgumentNullException.ThrowIfNull(options);
        ArgumentException.ThrowIfNullOrWhiteSpace(hostName);

        var key = new PipelineKey(driverInstanceId, hostName, capability);

        // Static lambda + explicit state argument: no closure is allocated on cache hits.
        return _pipelines.GetOrAdd(
            key,
            static (_, state) => Build(state.capability, state.options, state.timeProvider),
            (capability, options, timeProvider: _timeProvider));
    }

    /// <summary>
    /// Drop cached pipelines for one driver instance (e.g. on ResilienceConfig change).
    /// Test + Admin-reload use.
    /// </summary>
    /// <param name="driverInstanceId">Driver instance whose cached pipelines are removed.</param>
    /// <returns>Number of cache entries this call actually removed.</returns>
    public int Invalidate(string driverInstanceId)
    {
        var removed = 0;

        foreach (var key in _pipelines.Keys)
        {
            // TryRemove may lose to a concurrent remover; only count entries we removed ourselves.
            if (key.DriverInstanceId == driverInstanceId && _pipelines.TryRemove(key, out _))
            {
                removed++;
            }
        }

        return removed;
    }

    /// <summary>Snapshot of the current number of cached pipelines. For diagnostics only.</summary>
    public int CachedPipelineCount => _pipelines.Count;

    /// <summary>
    /// Builds one pipeline from the policy that <paramref name="options"/> resolves for
    /// <paramref name="capability"/>: timeout always, retry and circuit breaker only when the
    /// policy enables them.
    /// </summary>
    /// <param name="capability">Capability whose policy is resolved.</param>
    /// <param name="options">Source of the per-capability policy.</param>
    /// <param name="timeProvider">Clock used by the pipeline's strategies.</param>
    /// <returns>The built pipeline.</returns>
    private static ResiliencePipeline Build(
        DriverCapability capability,
        DriverResilienceOptions options,
        TimeProvider timeProvider)
    {
        var policy = options.Resolve(capability);
        var builder = new ResiliencePipelineBuilder { TimeProvider = timeProvider };

        // Added first => outermost strategy.
        builder.AddTimeout(new TimeoutStrategyOptions
        {
            Timeout = TimeSpan.FromSeconds(policy.TimeoutSeconds),
        });

        if (policy.RetryCount > 0)
        {
            builder.AddRetry(new RetryStrategyOptions
            {
                MaxRetryAttempts = policy.RetryCount,
                BackoffType = DelayBackoffType.Exponential,
                UseJitter = true,
                Delay = TimeSpan.FromMilliseconds(100),
                MaxDelay = TimeSpan.FromSeconds(5),
                // Cancellation is never retried.
                ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
            });
        }

        if (policy.BreakerFailureThreshold > 0)
        {
            builder.AddCircuitBreaker(new CircuitBreakerStrategyOptions
            {
                FailureRatio = 1.0,
                // Polly rejects MinimumThroughput below 2 when the pipeline is built, so a
                // configured threshold of 1 is clamped up instead of failing the first call.
                MinimumThroughput = Math.Max(MinimumValidBreakerThroughput, policy.BreakerFailureThreshold),
                SamplingDuration = TimeSpan.FromSeconds(30),
                BreakDuration = TimeSpan.FromSeconds(15),
                // Cancellation does not count as a failure towards opening the circuit.
                ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
            });
        }

        return builder.Build();
    }

    /// <summary>Cache key: one pipeline per (driver instance, host, capability).</summary>
    private readonly record struct PipelineKey(string DriverInstanceId, string HostName, DriverCapability Capability);
}