Lands the first chunk of the Phase 6.1 Stream A resilience layer per docs/v2/implementation/phase-6-1-resilience-and-observability.md §Stream A. Downstream CapabilityInvoker (A.3) + driver-dispatch wiring land in follow-up PRs on the same branch. Core.Abstractions additions: - WriteIdempotentAttribute — marker for tag-definition records that opt into auto-retry on IWritable.WriteAsync. Absence = no retry per decisions #44, #45, #143. Read once via reflection at driver-init time; no per-write cost. - DriverCapability enum — enumerates the 8 capability surface points (Read / Write / Discover / Subscribe / Probe / AlarmSubscribe / AlarmAcknowledge / HistoryRead). AlarmAcknowledge is write-shaped (no retry by default). - DriverTier enum — A/B/C per driver-stability.md §2-4. Stream B.1 wires this into DriverTypeMetadata; surfaced here because the resilience policy defaults key on it. Core.Resilience new namespace: - DriverResilienceOptions — per-tier × per-capability policy defaults. GetTierDefaults(tier) is the source of truth: * Tier A: Read 2s/3 retries, Write 2s/0 retries, breaker threshold 5 * Tier B: Read 4s/3, Write 4s/0, breaker threshold 5 * Tier C: Read 10s/1, Write 10s/0, breaker threshold 0 (supervisor handles process-level breaker per decision #68) Resolve(capability) overlays CapabilityPolicies on top of the defaults. - DriverResiliencePipelineBuilder — composes Timeout → Retry (capability- permitting, never on cancellation) → CircuitBreaker (tier-permitting) → Bulkhead. Pipelines cached in a lock-free ConcurrentDictionary keyed on (DriverInstanceId, HostName, DriverCapability) per decision #144 — one dead PLC behind a multi-device driver does not open the breaker for healthy siblings. Invalidate(driverInstanceId) supports Admin-triggered reload. Tests (30 new, all pass): - DriverResilienceOptionsTests: tier-default coverage for every capability, Write + AlarmAcknowledge never retry at any tier, Tier C disables breaker, resolve-with-override layering. - DriverResiliencePipelineBuilderTests: Read retries transients, Write does NOT retry on failure (decision #44 guard), dead-host isolation from sibling hosts, pipeline reuse for same triple, per-capability isolation, breaker opens after threshold on Tier A, timeout fires, cancellation is not retried, invalidation scoped to matching instance. Polly.Core 8.6.6 added to Core.csproj. Full solution dotnet test: 936 passing (baseline 906 + 30 new). One pre-existing Client.CLI Subscribe flake unchanged by this PR. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
97 lines
6.1 KiB
C#
97 lines
6.1 KiB
C#
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||
|
||
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||
|
||
/// <summary>
|
||
/// Per-tier × per-capability resilience policy configuration for a driver instance.
|
||
/// Bound from <c>DriverInstance.ResilienceConfig</c> JSON (nullable column; null = tier defaults).
|
||
/// Per <c>docs/v2/plan.md</c> decisions #143 and #144.
|
||
/// </summary>
|
||
public sealed record DriverResilienceOptions
|
||
{
|
||
/// <summary>Tier the owning driver type is registered as; drives the default map.</summary>
|
||
public required DriverTier Tier { get; init; }
|
||
|
||
/// <summary>
|
||
/// Per-capability policy overrides. Capabilities absent from this map fall back to
|
||
/// <see cref="GetTierDefaults(DriverTier)"/> for the configured <see cref="Tier"/>.
|
||
/// </summary>
|
||
public IReadOnlyDictionary<DriverCapability, CapabilityPolicy> CapabilityPolicies { get; init; }
|
||
= new Dictionary<DriverCapability, CapabilityPolicy>();
|
||
|
||
/// <summary>Bulkhead (max concurrent in-flight calls) for every capability. Default 32.</summary>
|
||
public int BulkheadMaxConcurrent { get; init; } = 32;
|
||
|
||
/// <summary>
|
||
/// Bulkhead queue depth. Zero = no queueing; overflow fails fast with
|
||
/// <c>BulkheadRejectedException</c>. Default 64.
|
||
/// </summary>
|
||
public int BulkheadMaxQueue { get; init; } = 64;
|
||
|
||
/// <summary>
|
||
/// Look up the effective policy for a capability, falling back to tier defaults when no
|
||
/// override is configured. Never returns null.
|
||
/// </summary>
|
||
public CapabilityPolicy Resolve(DriverCapability capability)
|
||
{
|
||
if (CapabilityPolicies.TryGetValue(capability, out var policy))
|
||
return policy;
|
||
|
||
var defaults = GetTierDefaults(Tier);
|
||
return defaults[capability];
|
||
}
|
||
|
||
/// <summary>
|
||
/// Per-tier per-capability default policy table, per decisions #143-144 and the Phase 6.1
|
||
/// Stream A.2 specification. Retries skipped on <see cref="DriverCapability.Write"/> and
|
||
/// <see cref="DriverCapability.AlarmAcknowledge"/> regardless of tier.
|
||
/// </summary>
|
||
public static IReadOnlyDictionary<DriverCapability, CapabilityPolicy> GetTierDefaults(DriverTier tier) =>
|
||
tier switch
|
||
{
|
||
DriverTier.A => new Dictionary<DriverCapability, CapabilityPolicy>
|
||
{
|
||
[DriverCapability.Read] = new(TimeoutSeconds: 2, RetryCount: 3, BreakerFailureThreshold: 5),
|
||
[DriverCapability.Write] = new(TimeoutSeconds: 2, RetryCount: 0, BreakerFailureThreshold: 5),
|
||
[DriverCapability.Discover] = new(TimeoutSeconds: 30, RetryCount: 2, BreakerFailureThreshold: 3),
|
||
[DriverCapability.Subscribe] = new(TimeoutSeconds: 5, RetryCount: 3, BreakerFailureThreshold: 5),
|
||
[DriverCapability.Probe] = new(TimeoutSeconds: 2, RetryCount: 3, BreakerFailureThreshold: 5),
|
||
[DriverCapability.AlarmSubscribe] = new(TimeoutSeconds: 5, RetryCount: 3, BreakerFailureThreshold: 5),
|
||
[DriverCapability.AlarmAcknowledge] = new(TimeoutSeconds: 5, RetryCount: 0, BreakerFailureThreshold: 5),
|
||
[DriverCapability.HistoryRead] = new(TimeoutSeconds: 30, RetryCount: 2, BreakerFailureThreshold: 5),
|
||
},
|
||
DriverTier.B => new Dictionary<DriverCapability, CapabilityPolicy>
|
||
{
|
||
[DriverCapability.Read] = new(TimeoutSeconds: 4, RetryCount: 3, BreakerFailureThreshold: 5),
|
||
[DriverCapability.Write] = new(TimeoutSeconds: 4, RetryCount: 0, BreakerFailureThreshold: 5),
|
||
[DriverCapability.Discover] = new(TimeoutSeconds: 60, RetryCount: 2, BreakerFailureThreshold: 3),
|
||
[DriverCapability.Subscribe] = new(TimeoutSeconds: 8, RetryCount: 3, BreakerFailureThreshold: 5),
|
||
[DriverCapability.Probe] = new(TimeoutSeconds: 4, RetryCount: 3, BreakerFailureThreshold: 5),
|
||
[DriverCapability.AlarmSubscribe] = new(TimeoutSeconds: 8, RetryCount: 3, BreakerFailureThreshold: 5),
|
||
[DriverCapability.AlarmAcknowledge] = new(TimeoutSeconds: 8, RetryCount: 0, BreakerFailureThreshold: 5),
|
||
[DriverCapability.HistoryRead] = new(TimeoutSeconds: 60, RetryCount: 2, BreakerFailureThreshold: 5),
|
||
},
|
||
DriverTier.C => new Dictionary<DriverCapability, CapabilityPolicy>
|
||
{
|
||
[DriverCapability.Read] = new(TimeoutSeconds: 10, RetryCount: 1, BreakerFailureThreshold: 0),
|
||
[DriverCapability.Write] = new(TimeoutSeconds: 10, RetryCount: 0, BreakerFailureThreshold: 0),
|
||
[DriverCapability.Discover] = new(TimeoutSeconds: 120, RetryCount: 1, BreakerFailureThreshold: 0),
|
||
[DriverCapability.Subscribe] = new(TimeoutSeconds: 15, RetryCount: 1, BreakerFailureThreshold: 0),
|
||
[DriverCapability.Probe] = new(TimeoutSeconds: 10, RetryCount: 1, BreakerFailureThreshold: 0),
|
||
[DriverCapability.AlarmSubscribe] = new(TimeoutSeconds: 15, RetryCount: 1, BreakerFailureThreshold: 0),
|
||
[DriverCapability.AlarmAcknowledge] = new(TimeoutSeconds: 15, RetryCount: 0, BreakerFailureThreshold: 0),
|
||
[DriverCapability.HistoryRead] = new(TimeoutSeconds: 120, RetryCount: 1, BreakerFailureThreshold: 0),
|
||
},
|
||
_ => throw new ArgumentOutOfRangeException(nameof(tier), tier, $"No default policy table defined for tier {tier}."),
|
||
};
|
||
}
|
||
|
||
/// <summary>Policy for one capability on one driver instance.</summary>
|
||
/// <param name="TimeoutSeconds">Per-call timeout (wraps the inner Polly execution).</param>
|
||
/// <param name="RetryCount">Number of retry attempts after the first failure; zero = no retry.</param>
|
||
/// <param name="BreakerFailureThreshold">
|
||
/// Consecutive-failure count that opens the circuit breaker; zero = no breaker
|
||
/// (Tier C uses the supervisor's process-level breaker instead, per decision #68).
|
||
/// </param>
|
||
public sealed record CapabilityPolicy(int TimeoutSeconds, int RetryCount, int BreakerFailureThreshold);
|