Phase 6.1 Stream A.1/A.2/A.6 — Polly resilience foundation: pipeline builder + per-tier policy defaults + WriteIdempotent attribute
Lands the first chunk of the Phase 6.1 Stream A resilience layer per docs/v2/implementation/phase-6-1-resilience-and-observability.md §Stream A. Downstream CapabilityInvoker (A.3) + driver-dispatch wiring land in follow-up PRs on the same branch. Core.Abstractions additions: - WriteIdempotentAttribute — marker for tag-definition records that opt into auto-retry on IWritable.WriteAsync. Absence = no retry per decisions #44, #45, #143. Read once via reflection at driver-init time; no per-write cost. - DriverCapability enum — enumerates the 8 capability surface points (Read / Write / Discover / Subscribe / Probe / AlarmSubscribe / AlarmAcknowledge / HistoryRead). AlarmAcknowledge is write-shaped (no retry by default). - DriverTier enum — A/B/C per driver-stability.md §2-4. Stream B.1 wires this into DriverTypeMetadata; surfaced here because the resilience policy defaults key on it. Core.Resilience new namespace: - DriverResilienceOptions — per-tier × per-capability policy defaults. GetTierDefaults(tier) is the source of truth: * Tier A: Read 2s/3 retries, Write 2s/0 retries, breaker threshold 5 * Tier B: Read 4s/3, Write 4s/0, breaker threshold 5 * Tier C: Read 10s/1, Write 10s/0, breaker threshold 0 (supervisor handles process-level breaker per decision #68) Resolve(capability) overlays CapabilityPolicies on top of the defaults. - DriverResiliencePipelineBuilder — composes Timeout → Retry (capability- permitting, never on cancellation) → CircuitBreaker (tier-permitting) → Bulkhead. Pipelines cached in a lock-free ConcurrentDictionary keyed on (DriverInstanceId, HostName, DriverCapability) per decision #144 — one dead PLC behind a multi-device driver does not open the breaker for healthy siblings. Invalidate(driverInstanceId) supports Admin-triggered reload. Tests (30 new, all pass): - DriverResilienceOptionsTests: tier-default coverage for every capability, Write + AlarmAcknowledge never retry at any tier, Tier C disables breaker, resolve-with-override layering. - DriverResiliencePipelineBuilderTests: Read retries transients, Write does NOT retry on failure (decision #44 guard), dead-host isolation from sibling hosts, pipeline reuse for same triple, per-capability isolation, breaker opens after threshold on Tier A, timeout fires, cancellation is not retried, invalidation scoped to matching instance. Polly.Core 8.6.6 added to Core.csproj. Full solution dotnet test: 936 passing (baseline 906 + 30 new). One pre-existing Client.CLI Subscribe flake unchanged by this PR. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,102 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Resilience;
|
||||
|
||||
[Trait("Category", "Unit")]
|
||||
public sealed class DriverResilienceOptionsTests
|
||||
{
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
[InlineData(DriverTier.C)]
|
||||
public void TierDefaults_Cover_EveryCapability(DriverTier tier)
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
|
||||
foreach (var capability in Enum.GetValues<DriverCapability>())
|
||||
defaults.ShouldContainKey(capability);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
[InlineData(DriverTier.C)]
|
||||
public void Write_NeverRetries_ByDefault(DriverTier tier)
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
defaults[DriverCapability.Write].RetryCount.ShouldBe(0);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
[InlineData(DriverTier.C)]
|
||||
public void AlarmAcknowledge_NeverRetries_ByDefault(DriverTier tier)
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
defaults[DriverCapability.AlarmAcknowledge].RetryCount.ShouldBe(0);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A, DriverCapability.Read)]
|
||||
[InlineData(DriverTier.A, DriverCapability.HistoryRead)]
|
||||
[InlineData(DriverTier.B, DriverCapability.Discover)]
|
||||
[InlineData(DriverTier.B, DriverCapability.Probe)]
|
||||
[InlineData(DriverTier.C, DriverCapability.AlarmSubscribe)]
|
||||
public void IdempotentCapabilities_Retry_ByDefault(DriverTier tier, DriverCapability capability)
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
defaults[capability].RetryCount.ShouldBeGreaterThan(0);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TierC_DisablesCircuitBreaker_DeferringToSupervisor()
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(DriverTier.C);
|
||||
|
||||
foreach (var (_, policy) in defaults)
|
||||
policy.BreakerFailureThreshold.ShouldBe(0, "Tier C breaker is handled by the Proxy supervisor (decision #68)");
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
public void TierAAndB_EnableCircuitBreaker(DriverTier tier)
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
|
||||
foreach (var (_, policy) in defaults)
|
||||
policy.BreakerFailureThreshold.ShouldBeGreaterThan(0);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Resolve_Uses_TierDefaults_When_NoOverride()
|
||||
{
|
||||
var options = new DriverResilienceOptions { Tier = DriverTier.A };
|
||||
|
||||
var resolved = options.Resolve(DriverCapability.Read);
|
||||
|
||||
resolved.ShouldBe(DriverResilienceOptions.GetTierDefaults(DriverTier.A)[DriverCapability.Read]);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Resolve_Uses_Override_When_Configured()
|
||||
{
|
||||
var custom = new CapabilityPolicy(TimeoutSeconds: 42, RetryCount: 7, BreakerFailureThreshold: 9);
|
||||
var options = new DriverResilienceOptions
|
||||
{
|
||||
Tier = DriverTier.A,
|
||||
CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Read] = custom,
|
||||
},
|
||||
};
|
||||
|
||||
options.Resolve(DriverCapability.Read).ShouldBe(custom);
|
||||
options.Resolve(DriverCapability.Write).ShouldBe(
|
||||
DriverResilienceOptions.GetTierDefaults(DriverTier.A)[DriverCapability.Write]);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,222 @@
|
||||
using Polly.CircuitBreaker;
|
||||
using Polly.Timeout;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Resilience;
|
||||
|
||||
[Trait("Category", "Unit")]
|
||||
public sealed class DriverResiliencePipelineBuilderTests
|
||||
{
|
||||
private static readonly DriverResilienceOptions TierAOptions = new() { Tier = DriverTier.A };
|
||||
|
||||
[Fact]
|
||||
public async Task Read_Retries_Transient_Failures()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var pipeline = builder.GetOrCreate(Guid.NewGuid(), "host-1", DriverCapability.Read, TierAOptions);
|
||||
var attempts = 0;
|
||||
|
||||
await pipeline.ExecuteAsync(async _ =>
|
||||
{
|
||||
attempts++;
|
||||
if (attempts < 3) throw new InvalidOperationException("transient");
|
||||
await Task.Yield();
|
||||
});
|
||||
|
||||
attempts.ShouldBe(3);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Write_DoesNotRetry_OnFailure()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var pipeline = builder.GetOrCreate(Guid.NewGuid(), "host-1", DriverCapability.Write, TierAOptions);
|
||||
var attempts = 0;
|
||||
|
||||
var ex = await Should.ThrowAsync<InvalidOperationException>(async () =>
|
||||
{
|
||||
await pipeline.ExecuteAsync(async _ =>
|
||||
{
|
||||
attempts++;
|
||||
await Task.Yield();
|
||||
throw new InvalidOperationException("boom");
|
||||
});
|
||||
});
|
||||
|
||||
attempts.ShouldBe(1);
|
||||
ex.Message.ShouldBe("boom");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task AlarmAcknowledge_DoesNotRetry_OnFailure()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var pipeline = builder.GetOrCreate(Guid.NewGuid(), "host-1", DriverCapability.AlarmAcknowledge, TierAOptions);
|
||||
var attempts = 0;
|
||||
|
||||
await Should.ThrowAsync<InvalidOperationException>(async () =>
|
||||
{
|
||||
await pipeline.ExecuteAsync(async _ =>
|
||||
{
|
||||
attempts++;
|
||||
await Task.Yield();
|
||||
throw new InvalidOperationException("boom");
|
||||
});
|
||||
});
|
||||
|
||||
attempts.ShouldBe(1);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Pipeline_IsIsolated_PerHost()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var driverId = Guid.NewGuid();
|
||||
|
||||
var hostA = builder.GetOrCreate(driverId, "host-a", DriverCapability.Read, TierAOptions);
|
||||
var hostB = builder.GetOrCreate(driverId, "host-b", DriverCapability.Read, TierAOptions);
|
||||
|
||||
hostA.ShouldNotBeSameAs(hostB);
|
||||
builder.CachedPipelineCount.ShouldBe(2);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Pipeline_IsReused_ForSameTriple()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var driverId = Guid.NewGuid();
|
||||
|
||||
var first = builder.GetOrCreate(driverId, "host-a", DriverCapability.Read, TierAOptions);
|
||||
var second = builder.GetOrCreate(driverId, "host-a", DriverCapability.Read, TierAOptions);
|
||||
|
||||
first.ShouldBeSameAs(second);
|
||||
builder.CachedPipelineCount.ShouldBe(1);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Pipeline_IsIsolated_PerCapability()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var driverId = Guid.NewGuid();
|
||||
|
||||
var read = builder.GetOrCreate(driverId, "host-a", DriverCapability.Read, TierAOptions);
|
||||
var write = builder.GetOrCreate(driverId, "host-a", DriverCapability.Write, TierAOptions);
|
||||
|
||||
read.ShouldNotBeSameAs(write);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DeadHost_DoesNotOpenBreaker_ForSiblingHost()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var driverId = Guid.NewGuid();
|
||||
|
||||
var deadHost = builder.GetOrCreate(driverId, "dead-plc", DriverCapability.Read, TierAOptions);
|
||||
var liveHost = builder.GetOrCreate(driverId, "live-plc", DriverCapability.Read, TierAOptions);
|
||||
|
||||
var threshold = TierAOptions.Resolve(DriverCapability.Read).BreakerFailureThreshold;
|
||||
for (var i = 0; i < threshold + 5; i++)
|
||||
{
|
||||
await Should.ThrowAsync<Exception>(async () =>
|
||||
await deadHost.ExecuteAsync(async _ =>
|
||||
{
|
||||
await Task.Yield();
|
||||
throw new InvalidOperationException("dead plc");
|
||||
}));
|
||||
}
|
||||
|
||||
var liveAttempts = 0;
|
||||
await liveHost.ExecuteAsync(async _ =>
|
||||
{
|
||||
liveAttempts++;
|
||||
await Task.Yield();
|
||||
});
|
||||
|
||||
liveAttempts.ShouldBe(1, "healthy sibling host must not be affected by dead peer");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task CircuitBreaker_Opens_AfterFailureThreshold_OnTierA()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var pipeline = builder.GetOrCreate(Guid.NewGuid(), "host-1", DriverCapability.Write, TierAOptions);
|
||||
|
||||
var threshold = TierAOptions.Resolve(DriverCapability.Write).BreakerFailureThreshold;
|
||||
for (var i = 0; i < threshold; i++)
|
||||
{
|
||||
await Should.ThrowAsync<InvalidOperationException>(async () =>
|
||||
await pipeline.ExecuteAsync(async _ =>
|
||||
{
|
||||
await Task.Yield();
|
||||
throw new InvalidOperationException("boom");
|
||||
}));
|
||||
}
|
||||
|
||||
await Should.ThrowAsync<BrokenCircuitException>(async () =>
|
||||
await pipeline.ExecuteAsync(async _ =>
|
||||
{
|
||||
await Task.Yield();
|
||||
}));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Timeout_Cancels_SlowOperation()
|
||||
{
|
||||
var tierAWithShortTimeout = new DriverResilienceOptions
|
||||
{
|
||||
Tier = DriverTier.A,
|
||||
CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Read] = new(TimeoutSeconds: 1, RetryCount: 0, BreakerFailureThreshold: 5),
|
||||
},
|
||||
};
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var pipeline = builder.GetOrCreate(Guid.NewGuid(), "host-1", DriverCapability.Read, tierAWithShortTimeout);
|
||||
|
||||
await Should.ThrowAsync<TimeoutRejectedException>(async () =>
|
||||
await pipeline.ExecuteAsync(async ct =>
|
||||
{
|
||||
await Task.Delay(TimeSpan.FromSeconds(5), ct);
|
||||
}));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Invalidate_Removes_OnlyMatchingInstance()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var keepId = Guid.NewGuid();
|
||||
var dropId = Guid.NewGuid();
|
||||
|
||||
builder.GetOrCreate(keepId, "h", DriverCapability.Read, TierAOptions);
|
||||
builder.GetOrCreate(keepId, "h", DriverCapability.Write, TierAOptions);
|
||||
builder.GetOrCreate(dropId, "h", DriverCapability.Read, TierAOptions);
|
||||
|
||||
var removed = builder.Invalidate(dropId);
|
||||
|
||||
removed.ShouldBe(1);
|
||||
builder.CachedPipelineCount.ShouldBe(2);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Cancellation_IsNot_Retried()
|
||||
{
|
||||
var builder = new DriverResiliencePipelineBuilder();
|
||||
var pipeline = builder.GetOrCreate(Guid.NewGuid(), "host-1", DriverCapability.Read, TierAOptions);
|
||||
var attempts = 0;
|
||||
using var cts = new CancellationTokenSource();
|
||||
cts.Cancel();
|
||||
|
||||
await Should.ThrowAsync<OperationCanceledException>(async () =>
|
||||
await pipeline.ExecuteAsync(async ct =>
|
||||
{
|
||||
attempts++;
|
||||
ct.ThrowIfCancellationRequested();
|
||||
await Task.Yield();
|
||||
}, cts.Token));
|
||||
|
||||
attempts.ShouldBeLessThanOrEqualTo(1);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user