Lands the first chunk of the Phase 6.1 Stream A resilience layer per docs/v2/implementation/phase-6-1-resilience-and-observability.md §Stream A. Downstream CapabilityInvoker (A.3) + driver-dispatch wiring land in follow-up PRs on the same branch.

Core.Abstractions additions:
- WriteIdempotentAttribute — marker for tag-definition records that opt into auto-retry on IWritable.WriteAsync. Absence = no retry per decisions #44, #45, #143. Read once via reflection at driver-init time; no per-write cost.
- DriverCapability enum — enumerates the 8 capability surface points (Read / Write / Discover / Subscribe / Probe / AlarmSubscribe / AlarmAcknowledge / HistoryRead). AlarmAcknowledge is write-shaped (no retry by default).
- DriverTier enum — A/B/C per driver-stability.md §2-4. Stream B.1 wires this into DriverTypeMetadata; surfaced here because the resilience policy defaults key on it.

Core.Resilience new namespace:
- DriverResilienceOptions — per-tier × per-capability policy defaults. GetTierDefaults(tier) is the source of truth:
  * Tier A: Read 2s/3 retries, Write 2s/0 retries, breaker threshold 5
  * Tier B: Read 4s/3, Write 4s/0, breaker threshold 5
  * Tier C: Read 10s/1, Write 10s/0, breaker threshold 0 (supervisor handles process-level breaker per decision #68)
  Resolve(capability) overlays CapabilityPolicies on top of the defaults.
- DriverResiliencePipelineBuilder — composes Timeout → Retry (capability-permitting, never on cancellation) → CircuitBreaker (tier-permitting) → Bulkhead. Pipelines cached in a lock-free ConcurrentDictionary keyed on (DriverInstanceId, HostName, DriverCapability) per decision #144 — one dead PLC behind a multi-device driver does not open the breaker for healthy siblings. Invalidate(driverInstanceId) supports Admin-triggered reload.

Tests (30 new, all pass):
- DriverResilienceOptionsTests: tier-default coverage for every capability, Write + AlarmAcknowledge never retry at any tier, Tier C disables breaker, resolve-with-override layering.
- DriverResiliencePipelineBuilderTests: Read retries transients, Write does NOT retry on failure (decision #44 guard), dead-host isolation from sibling hosts, pipeline reuse for same triple, per-capability isolation, breaker opens after threshold on Tier A, timeout fires, cancellation is not retried, invalidation scoped to matching instance.

Polly.Core 8.6.6 added to Core.csproj. Full solution dotnet test: 936 passing (baseline 906 + 30 new). One pre-existing Client.CLI Subscribe flake unchanged by this PR.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
223 lines
7.4 KiB
C#
223 lines
7.4 KiB
C#
using Polly.CircuitBreaker;
|
|
using Polly.Timeout;
|
|
using Shouldly;
|
|
using Xunit;
|
|
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
|
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
|
|
|
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Resilience;
|
|
|
|
/// <summary>
/// Unit tests for <see cref="DriverResiliencePipelineBuilder"/>: retry behaviour per capability,
/// pipeline cache keying, circuit-breaker and timeout behaviour, cancellation handling and
/// cache invalidation.
/// </summary>
[Trait("Category", "Unit")]
public sealed class DriverResiliencePipelineBuilderTests
{
    /// <summary>Tier A options without per-capability overrides; shared by most tests.</summary>
    private static readonly DriverResilienceOptions TierAOptions = new DriverResilienceOptions { Tier = DriverTier.A };

    /// <summary>
    /// Yields once so the failure surfaces asynchronously, then throws an
    /// <see cref="InvalidOperationException"/> carrying <paramref name="message"/>.
    /// </summary>
    private static async Task FailAfterYieldAsync(string message)
    {
        await Task.Yield();
        throw new InvalidOperationException(message);
    }

    /// <summary>
    /// A Read pipeline keeps invoking the callback while it throws: two failures followed by a
    /// success result in exactly three invocations.
    /// </summary>
    [Fact]
    public async Task Read_Retries_Transient_Failures()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var readPipeline = sut.GetOrCreate(Guid.NewGuid(), "host-1", DriverCapability.Read, TierAOptions);
        var callCount = 0;

        await readPipeline.ExecuteAsync(async _ =>
        {
            // The first two invocations fail; the third completes normally.
            var isTransientAttempt = ++callCount < 3;
            if (isTransientAttempt)
            {
                throw new InvalidOperationException("transient");
            }

            await Task.Yield();
        });

        callCount.ShouldBe(3);
    }

    /// <summary>
    /// A failing Write callback is invoked exactly once and its exception reaches the caller
    /// with the original message.
    /// </summary>
    [Fact]
    public async Task Write_DoesNotRetry_OnFailure()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var writePipeline = sut.GetOrCreate(Guid.NewGuid(), "host-1", DriverCapability.Write, TierAOptions);
        var callCount = 0;

        var thrown = await Should.ThrowAsync<InvalidOperationException>(async () =>
            await writePipeline.ExecuteAsync(async _ =>
            {
                callCount += 1;
                await FailAfterYieldAsync("boom");
            }));

        callCount.ShouldBe(1);
        thrown.Message.ShouldBe("boom");
    }

    /// <summary>
    /// A failing AlarmAcknowledge callback is invoked exactly once, mirroring the Write case.
    /// </summary>
    [Fact]
    public async Task AlarmAcknowledge_DoesNotRetry_OnFailure()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var ackPipeline = sut.GetOrCreate(Guid.NewGuid(), "host-1", DriverCapability.AlarmAcknowledge, TierAOptions);
        var callCount = 0;

        await Should.ThrowAsync<InvalidOperationException>(async () =>
            await ackPipeline.ExecuteAsync(async _ =>
            {
                callCount += 1;
                await FailAfterYieldAsync("boom");
            }));

        callCount.ShouldBe(1);
    }

    /// <summary>
    /// Two host names under the same driver instance and capability yield two distinct
    /// cached pipelines.
    /// </summary>
    [Fact]
    public void Pipeline_IsIsolated_PerHost()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var sharedDriverId = Guid.NewGuid();

        var pipelineForHostA = sut.GetOrCreate(sharedDriverId, "host-a", DriverCapability.Read, TierAOptions);
        var pipelineForHostB = sut.GetOrCreate(sharedDriverId, "host-b", DriverCapability.Read, TierAOptions);

        pipelineForHostA.ShouldNotBeSameAs(pipelineForHostB);
        sut.CachedPipelineCount.ShouldBe(2);
    }

    /// <summary>
    /// Requesting the same (driver instance, host, capability) triple twice returns the same
    /// instance and leaves a single cache entry.
    /// </summary>
    [Fact]
    public void Pipeline_IsReused_ForSameTriple()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var sharedDriverId = Guid.NewGuid();

        var initial = sut.GetOrCreate(sharedDriverId, "host-a", DriverCapability.Read, TierAOptions);
        var repeated = sut.GetOrCreate(sharedDriverId, "host-a", DriverCapability.Read, TierAOptions);

        initial.ShouldBeSameAs(repeated);
        sut.CachedPipelineCount.ShouldBe(1);
    }

    /// <summary>
    /// Read and Write on the same driver instance and host resolve to different pipeline instances.
    /// </summary>
    [Fact]
    public void Pipeline_IsIsolated_PerCapability()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var sharedDriverId = Guid.NewGuid();

        var readPipeline = sut.GetOrCreate(sharedDriverId, "host-a", DriverCapability.Read, TierAOptions);
        var writePipeline = sut.GetOrCreate(sharedDriverId, "host-a", DriverCapability.Write, TierAOptions);

        readPipeline.ShouldNotBeSameAs(writePipeline);
    }

    /// <summary>
    /// Failing one host's pipeline more times than the breaker threshold must not stop a
    /// sibling host's pipeline (same driver instance) from executing its callback.
    /// </summary>
    [Fact]
    public async Task DeadHost_DoesNotOpenBreaker_ForSiblingHost()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var sharedDriverId = Guid.NewGuid();

        var deadPipeline = sut.GetOrCreate(sharedDriverId, "dead-plc", DriverCapability.Read, TierAOptions);
        var livePipeline = sut.GetOrCreate(sharedDriverId, "live-plc", DriverCapability.Read, TierAOptions);

        // Go past the threshold so the dead host's executions keep failing well beyond it.
        var failuresToInject = TierAOptions.Resolve(DriverCapability.Read).BreakerFailureThreshold + 5;
        for (var injected = 0; injected < failuresToInject; injected++)
        {
            // Any exception type is accepted here: the assertion does not distinguish the
            // callback's own failure from a rejection raised by the pipeline.
            await Should.ThrowAsync<Exception>(async () =>
            {
                await deadPipeline.ExecuteAsync(async _ =>
                {
                    await FailAfterYieldAsync("dead plc");
                });
            });
        }

        var liveCallCount = 0;
        await livePipeline.ExecuteAsync(async _ =>
        {
            liveCallCount += 1;
            await Task.Yield();
        });

        liveCallCount.ShouldBe(1, "healthy sibling host must not be affected by dead peer");
    }

    /// <summary>
    /// After as many failed Write executions as the Tier A breaker threshold, the next
    /// execution is rejected with <see cref="BrokenCircuitException"/>.
    /// </summary>
    [Fact]
    public async Task CircuitBreaker_Opens_AfterFailureThreshold_OnTierA()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var writePipeline = sut.GetOrCreate(Guid.NewGuid(), "host-1", DriverCapability.Write, TierAOptions);

        var failureBudget = TierAOptions.Resolve(DriverCapability.Write).BreakerFailureThreshold;
        for (var failure = 1; failure <= failureBudget; failure++)
        {
            await Should.ThrowAsync<InvalidOperationException>(async () =>
            {
                await writePipeline.ExecuteAsync(async _ =>
                {
                    await FailAfterYieldAsync("boom");
                });
            });
        }

        // This callback would succeed if invoked; the test expects the pipeline to reject the call.
        await Should.ThrowAsync<BrokenCircuitException>(async () =>
        {
            await writePipeline.ExecuteAsync(async _ =>
            {
                await Task.Yield();
            });
        });
    }

    /// <summary>
    /// With the Read timeout overridden to one second, a five-second operation surfaces as
    /// <see cref="TimeoutRejectedException"/>.
    /// </summary>
    [Fact]
    public async Task Timeout_Cancels_SlowOperation()
    {
        var shortReadPolicy = new CapabilityPolicy(TimeoutSeconds: 1, RetryCount: 0, BreakerFailureThreshold: 5);
        var shortTimeoutOptions = new DriverResilienceOptions
        {
            Tier = DriverTier.A,
            CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
            {
                { DriverCapability.Read, shortReadPolicy },
            },
        };
        var sut = new DriverResiliencePipelineBuilder();
        var readPipeline = sut.GetOrCreate(Guid.NewGuid(), "host-1", DriverCapability.Read, shortTimeoutOptions);
        var slowOperationDuration = TimeSpan.FromSeconds(5);

        await Should.ThrowAsync<TimeoutRejectedException>(async () =>
        {
            await readPipeline.ExecuteAsync(async ct =>
            {
                // The delay observes the pipeline-supplied token.
                await Task.Delay(slowOperationDuration, ct);
            });
        });
    }

    /// <summary>
    /// Invalidating one driver instance reports one removed entry and leaves the two entries
    /// of the other instance in the cache.
    /// </summary>
    [Fact]
    public void Invalidate_Removes_OnlyMatchingInstance()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var survivorId = Guid.NewGuid();
        var evictedId = Guid.NewGuid();

        sut.GetOrCreate(survivorId, "h", DriverCapability.Read, TierAOptions);
        sut.GetOrCreate(survivorId, "h", DriverCapability.Write, TierAOptions);
        sut.GetOrCreate(evictedId, "h", DriverCapability.Read, TierAOptions);

        var evictedCount = sut.Invalidate(evictedId);

        evictedCount.ShouldBe(1);
        sut.CachedPipelineCount.ShouldBe(2);
    }

    /// <summary>
    /// Executing a Read pipeline with an already-cancelled token throws
    /// <see cref="OperationCanceledException"/> and invokes the callback at most once.
    /// </summary>
    [Fact]
    public async Task Cancellation_IsNot_Retried()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var readPipeline = sut.GetOrCreate(Guid.NewGuid(), "host-1", DriverCapability.Read, TierAOptions);
        var callCount = 0;
        using var cancelledSource = new CancellationTokenSource();
        cancelledSource.Cancel();
        var cancelledToken = cancelledSource.Token;

        await Should.ThrowAsync<OperationCanceledException>(async () =>
        {
            await readPipeline.ExecuteAsync(async ct =>
            {
                callCount += 1;
                ct.ThrowIfCancellationRequested();
                await Task.Yield();
            }, cancelledToken);
        });

        // Zero is also acceptable: the callback may never run when the token is already cancelled.
        callCount.ShouldBeLessThanOrEqualTo(1);
    }
}
|