using System.Collections.Concurrent;
using Polly;
using Polly.CircuitBreaker;
using Polly.Retry;
using Polly.Timeout;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
///
/// Builds and caches Polly resilience pipelines keyed on
/// (DriverInstanceId, HostName, DriverCapability). One dead PLC behind a multi-device
/// driver cannot open the circuit breaker for healthy sibling hosts.
///
///
/// Per docs/v2/plan.md decision #144 (per-device isolation). Composition from outside-in:
/// Timeout → Retry (when capability permits) → Circuit Breaker (when tier permits) → Bulkhead.
///
/// Pipeline resolution is lock-free on the hot path: the inner
/// caches a per key;
/// first-call cost is one .Build. Thereafter reads are O(1).
///
public sealed class DriverResiliencePipelineBuilder
{
private readonly ConcurrentDictionary _pipelines = new();
private readonly TimeProvider _timeProvider;
private readonly DriverResilienceStatusTracker? _statusTracker;
/// Construct with the ambient clock (use in prod).
/// Clock source for pipeline timeouts + breaker sampling. Defaults to system.
/// When non-null, every built pipeline wires Polly telemetry into
/// the tracker — retries increment ConsecutiveFailures, breaker-open stamps
/// LastBreakerOpenUtc, breaker-close resets failures. Feeds Admin /hosts +
/// the Polly bulkhead-depth column. Absent tracker means no telemetry (unit tests +
/// deployments that don't care about resilience observability).
public DriverResiliencePipelineBuilder(
TimeProvider? timeProvider = null,
DriverResilienceStatusTracker? statusTracker = null)
{
_timeProvider = timeProvider ?? TimeProvider.System;
_statusTracker = statusTracker;
}
///
/// Get or build the pipeline for a given (driver instance, host, capability) triple.
/// Calls with the same key + same options reuse the same pipeline instance; the first caller
/// wins if a race occurs (both pipelines would be behaviourally identical).
///
/// DriverInstance primary key — opaque to this layer.
///
/// Host the call targets. For single-host drivers (Galaxy, some OPC UA Client configs) pass the
/// driver's canonical host string. For multi-host drivers (Modbus with N PLCs), pass the
/// specific PLC so one dead PLC doesn't poison healthy siblings.
///
/// Which capability surface is being called.
/// Per-driver-instance options (tier + per-capability overrides).
public ResiliencePipeline GetOrCreate(
string driverInstanceId,
string hostName,
DriverCapability capability,
DriverResilienceOptions options)
{
ArgumentNullException.ThrowIfNull(options);
ArgumentException.ThrowIfNullOrWhiteSpace(hostName);
var key = new PipelineKey(driverInstanceId, hostName, capability);
return _pipelines.GetOrAdd(key, static (k, state) => Build(
k.DriverInstanceId, k.HostName, state.capability, state.options, state.timeProvider, state.tracker),
(capability, options, timeProvider: _timeProvider, tracker: _statusTracker));
}
/// Drop cached pipelines for one driver instance (e.g. on ResilienceConfig change). Test + Admin-reload use.
public int Invalidate(string driverInstanceId)
{
var removed = 0;
foreach (var key in _pipelines.Keys)
{
if (key.DriverInstanceId == driverInstanceId && _pipelines.TryRemove(key, out _))
removed++;
}
return removed;
}
/// Snapshot of the current number of cached pipelines. For diagnostics only.
public int CachedPipelineCount => _pipelines.Count;
private static ResiliencePipeline Build(
string driverInstanceId,
string hostName,
DriverCapability capability,
DriverResilienceOptions options,
TimeProvider timeProvider,
DriverResilienceStatusTracker? tracker)
{
var policy = options.Resolve(capability);
var builder = new ResiliencePipelineBuilder { TimeProvider = timeProvider };
builder.AddTimeout(new TimeoutStrategyOptions
{
Timeout = TimeSpan.FromSeconds(policy.TimeoutSeconds),
});
if (policy.RetryCount > 0)
{
var retryOptions = new RetryStrategyOptions
{
MaxRetryAttempts = policy.RetryCount,
BackoffType = DelayBackoffType.Exponential,
UseJitter = true,
Delay = TimeSpan.FromMilliseconds(100),
MaxDelay = TimeSpan.FromSeconds(5),
ShouldHandle = new PredicateBuilder().Handle(ex => ex is not OperationCanceledException),
};
if (tracker is not null)
{
retryOptions.OnRetry = args =>
{
tracker.RecordFailure(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
return default;
};
}
builder.AddRetry(retryOptions);
}
if (policy.BreakerFailureThreshold > 0)
{
var breakerOptions = new CircuitBreakerStrategyOptions
{
FailureRatio = 1.0,
MinimumThroughput = policy.BreakerFailureThreshold,
SamplingDuration = TimeSpan.FromSeconds(30),
BreakDuration = TimeSpan.FromSeconds(15),
ShouldHandle = new PredicateBuilder().Handle(ex => ex is not OperationCanceledException),
};
if (tracker is not null)
{
breakerOptions.OnOpened = args =>
{
tracker.RecordBreakerOpen(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
return default;
};
breakerOptions.OnClosed = args =>
{
// Closing the breaker means the target recovered — reset the consecutive-
// failure counter so Admin UI stops flashing red for this host.
tracker.RecordSuccess(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
return default;
};
}
builder.AddCircuitBreaker(breakerOptions);
}
return builder.Build();
}
private readonly record struct PipelineKey(string DriverInstanceId, string HostName, DriverCapability Capability);
}