Phase 6.3 Stream B + Stream D (core) — ServiceLevelCalculator + RecoveryStateManager + ApplyLeaseRegistry
Lands the pure-logic heart of Phase 6.3. OPC UA node wiring (Stream C), RedundancyCoordinator topology loader (Stream A), Admin UI + metrics (Stream E), and client interop tests (Stream F) are follow-up work — tracked as tasks #145-150.

New Server.Redundancy sub-namespace:

- ServiceLevelCalculator — pure 8-state matrix per decision #154. Inputs: role, selfHealthy, peerUa/HttpHealthy, applyInProgress, recoveryDwellMet, topologyValid, operatorMaintenance. Output: OPC UA Part 5 §6.3.34 Byte. Reserved bands (0=Maintenance, 1=NoData, 2=InvalidTopology) override everything; operational bands occupy 30..255. Key invariants:
  * Authoritative-Primary = 255, Authoritative-Backup = 100.
  * Isolated-Primary = 230 (retains authority with peer down).
  * Isolated-Backup = 80 (does NOT auto-promote — non-transparent model).
  * Primary-Mid-Apply = 200, Backup-Mid-Apply = 50; apply dominates peer-unreachable per Stream C.4 integration expectation.
  * Recovering-Primary = 180, Recovering-Backup = 30.
  * Standalone treats healthy as Authoritative-Primary (no peer concept).
- ServiceLevelBand enum — labels every numeric band for logs + Admin UI. Values match the calculator table exactly; compliance script asserts drift detection.
- RecoveryStateManager — holds Recovering band until (dwell ≥ 60s default) AND (one publish witness observed). Re-fault resets both gates so a flapping node doesn't shortcut through recovery twice.
- ApplyLeaseRegistry — keyed on (ConfigGenerationId, PublishRequestId) per decision #162. BeginApplyLease returns an IAsyncDisposable so every exit path (success, exception, cancellation, dispose-twice) closes the lease. ApplyMaxDuration watchdog (10 min default) via PruneStale tick forces close after a crashed publisher so ServiceLevel can't stick at mid-apply.
Tests (40 new, all pass):

- ServiceLevelCalculatorTests (27): reserved bands override; self-unhealthy → NoData; invalid topology demotes both nodes to 2; authoritative primary 255; backup 100; isolated primary 230 retains authority; isolated backup 80 does not promote; http-only unreachable triggers isolated; mid-apply primary 200; mid-apply backup 50; apply dominates peer-unreachable; recovering primary 180; recovering backup 30; standalone treats healthy as 255; classify round-trips every band including Unknown sentinel.
- RecoveryStateManagerTests (6): never-faulted auto-meets dwell; faulted-only returns true (semantics-doc test — coordinator short-circuits on selfHealthy=false); recovered without witness never meets; witness without dwell never meets; witness + dwell-elapsed meets; re-fault resets.
- ApplyLeaseRegistryTests (7): empty registry not-in-progress; begin+dispose closes; dispose on exception still closes; dispose twice safe; concurrent leases isolated; watchdog closes stale; watchdog leaves recent alone.

Full solution dotnet test: 1137 passing (Phase 6.2 shipped at 1097, Phase 6.3 B + D core = +40 = 1137). Pre-existing Client.CLI Subscribe flake unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
118
tests/ZB.MOM.WW.OtOpcUa.Server.Tests/ApplyLeaseRegistryTests.cs
Normal file
118
tests/ZB.MOM.WW.OtOpcUa.Server.Tests/ApplyLeaseRegistryTests.cs
Normal file
@@ -0,0 +1,118 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Server.Redundancy;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Server.Tests;
|
||||
|
||||
/// <summary>
/// Unit tests for <see cref="ApplyLeaseRegistry"/>: a lease must be closed on every exit
/// path (normal dispose, exception, repeated dispose), independent leases must not affect
/// each other, and <c>PruneStale</c> must force-close leases older than the configured
/// maximum apply duration.
/// </summary>
[Trait("Category", "Unit")]
public sealed class ApplyLeaseRegistryTests
{
    // Fixed reference instant; the fake clock starts here so elapsed-time maths is exact.
    private static readonly DateTime T0 = new(2026, 4, 19, 12, 0, 0, DateTimeKind.Utc);

    /// <summary>
    /// Hand-driven clock. Tests advance time by assigning <see cref="Utc"/>.
    /// </summary>
    private sealed class FakeTimeProvider : TimeProvider
    {
        public DateTime Utc { get; set; } = T0;

        public override DateTimeOffset GetUtcNow()
        {
            return new DateTimeOffset(Utc, TimeSpan.Zero);
        }
    }

    /// <summary>A freshly constructed registry reports no apply in progress.</summary>
    [Fact]
    public async Task EmptyRegistry_NotInProgress()
    {
        var registry = new ApplyLeaseRegistry();

        var inProgress = registry.IsApplyInProgress;

        inProgress.ShouldBeFalse();
        await Task.Yield();
    }

    /// <summary>Opening a lease flips the registry to in-progress; disposing it flips it back.</summary>
    [Fact]
    public async Task BeginAndDispose_ClosesLease()
    {
        var registry = new ApplyLeaseRegistry();
        var lease = registry.BeginApplyLease(1, Guid.NewGuid());

        try
        {
            registry.IsApplyInProgress.ShouldBeTrue();
            registry.OpenLeaseCount.ShouldBe(1);
        }
        finally
        {
            await lease.DisposeAsync();
        }

        registry.IsApplyInProgress.ShouldBeFalse();
    }

    /// <summary>An exception thrown while the lease is held must still release it.</summary>
    [Fact]
    public async Task Dispose_OnException_StillCloses()
    {
        var registry = new ApplyLeaseRegistry();
        var publishId = Guid.NewGuid();

        // Models a publisher that acquires the lease and then blows up mid-apply.
        async Task PublishAndFail()
        {
            await using (registry.BeginApplyLease(1, publishId))
            {
                throw new InvalidOperationException("publish failed");
            }
        }

        await Should.ThrowAsync<InvalidOperationException>(() => PublishAndFail());

        registry.IsApplyInProgress.ShouldBeFalse("await-using semantics must close the lease on exception");
    }

    /// <summary>Disposing the same lease a second time is a harmless no-op.</summary>
    [Fact]
    public async Task Dispose_TwiceIsSafe()
    {
        var registry = new ApplyLeaseRegistry();
        var lease = registry.BeginApplyLease(1, Guid.NewGuid());

        for (var attempt = 0; attempt < 2; attempt++)
        {
            await lease.DisposeAsync();
        }

        registry.IsApplyInProgress.ShouldBeFalse();
    }

    /// <summary>Closing one lease leaves a second, independently keyed lease open.</summary>
    [Fact]
    public async Task MultipleLeases_Concurrent_StayIsolated()
    {
        var registry = new ApplyLeaseRegistry();
        Guid firstPublish = Guid.NewGuid(), secondPublish = Guid.NewGuid();

        // The await-using declarations dispose again at scope exit, after the explicit
        // disposals below; the test relies on repeated disposal being safe.
        await using var first = registry.BeginApplyLease(1, firstPublish);
        await using var second = registry.BeginApplyLease(2, secondPublish);

        registry.OpenLeaseCount.ShouldBe(2);

        await first.DisposeAsync();
        registry.IsApplyInProgress.ShouldBeTrue("lease2 still open");

        await second.DisposeAsync();
        registry.IsApplyInProgress.ShouldBeFalse();
    }

    /// <summary>
    /// A lease that is never disposed is left alone while younger than the maximum apply
    /// duration and is force-closed by <c>PruneStale</c> once it is older.
    /// </summary>
    [Fact]
    public async Task Watchdog_ClosesStaleLeases()
    {
        var clock = new FakeTimeProvider();
        var registry = new ApplyLeaseRegistry(applyMaxDuration: TimeSpan.FromMinutes(10), timeProvider: clock);

        // Deliberately leaked: the lease is opened and its handle thrown away.
        _ = registry.BeginApplyLease(1, Guid.NewGuid());

        // Five minutes in: younger than the 10-minute limit, so nothing is pruned.
        clock.Utc = T0 + TimeSpan.FromMinutes(5);
        var closedWhileYoung = registry.PruneStale();
        closedWhileYoung.ShouldBe(0);
        registry.IsApplyInProgress.ShouldBeTrue();

        // Eleven minutes in: past the limit, so the watchdog closes it.
        clock.Utc = T0 + TimeSpan.FromMinutes(11);
        var closedWhenStale = registry.PruneStale();

        closedWhenStale.ShouldBe(1);
        registry.IsApplyInProgress.ShouldBeFalse("ServiceLevel can't stick at mid-apply after a crashed publisher");
        await Task.Yield();
    }

    /// <summary>A lease well inside the maximum apply duration survives a prune pass.</summary>
    [Fact]
    public async Task Watchdog_LeavesRecentLeaseAlone()
    {
        var clock = new FakeTimeProvider();
        var registry = new ApplyLeaseRegistry(applyMaxDuration: TimeSpan.FromMinutes(10), timeProvider: clock);

        await using var lease = registry.BeginApplyLease(1, Guid.NewGuid());
        clock.Utc = T0 + TimeSpan.FromMinutes(3);

        var pruned = registry.PruneStale();

        pruned.ShouldBe(0);
        registry.IsApplyInProgress.ShouldBeTrue();
    }
}
|
||||
@@ -0,0 +1,92 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Server.Redundancy;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Server.Tests;
|
||||
|
||||
/// <summary>
/// Unit tests for <see cref="RecoveryStateManager"/>. After a fault and recovery,
/// <c>IsDwellMet</c> must stay false until BOTH the dwell time has elapsed AND a publish
/// witness has been recorded; a new fault must reset both conditions.
/// </summary>
[Trait("Category", "Unit")]
public sealed class RecoveryStateManagerTests
{
    // Fixed reference instant; the fake clock starts here so elapsed-time maths is exact.
    private static readonly DateTime T0 = new(2026, 4, 19, 12, 0, 0, DateTimeKind.Utc);

    /// <summary>
    /// Hand-driven clock. Tests advance time by assigning <see cref="Utc"/>.
    /// </summary>
    private sealed class FakeTimeProvider : TimeProvider
    {
        public DateTime Utc { get; set; } = T0;

        public override DateTimeOffset GetUtcNow() => new(Utc, TimeSpan.Zero);
    }

    /// <summary>A manager that has never seen a fault reports the dwell as already met.</summary>
    [Fact]
    public void NeverFaulted_DwellIsAutomaticallyMet()
    {
        var mgr = new RecoveryStateManager();

        mgr.IsDwellMet().ShouldBeTrue();
    }

    /// <summary>Pins the documented (and deliberately permissive) behaviour while still faulted.</summary>
    [Fact]
    public void AfterFault_Only_IsDwellMet_Returns_True_ButCallerDoesntQueryDuringFaulted()
    {
        // Documented semantics: IsDwellMet is only consulted when selfHealthy=true (i.e. the
        // node has recovered into Healthy). During Faulted the coordinator short-circuits on
        // the self-health check and never calls IsDwellMet. So returning true here is harmless;
        // the test captures the intent so a future "return false during Faulted" tweak has to
        // deliberately change this test first.
        var mgr = new RecoveryStateManager();

        mgr.MarkFaulted();

        mgr.IsDwellMet().ShouldBeTrue();
    }

    /// <summary>Elapsed dwell alone is not enough: without a publish witness the gate stays shut.</summary>
    [Fact]
    public void AfterRecovery_NoWitness_DwellNotMet_EvenAfterElapsed()
    {
        var clock = new FakeTimeProvider();
        var mgr = new RecoveryStateManager(dwellTime: TimeSpan.FromSeconds(60), timeProvider: clock);

        mgr.MarkFaulted();
        mgr.MarkRecovered();
        clock.Utc = T0.AddSeconds(120);

        mgr.IsDwellMet().ShouldBeFalse("dwell elapsed but no publish witness — must NOT escape Recovering band");
    }

    /// <summary>A witness alone is not enough: the dwell time must also have elapsed.</summary>
    [Fact]
    public void AfterRecovery_WitnessButTooSoon_DwellNotMet()
    {
        var clock = new FakeTimeProvider();
        var mgr = new RecoveryStateManager(dwellTime: TimeSpan.FromSeconds(60), timeProvider: clock);

        mgr.MarkFaulted();
        mgr.MarkRecovered();
        mgr.RecordPublishWitness();
        clock.Utc = T0.AddSeconds(30);

        mgr.IsDwellMet().ShouldBeFalse("witness ok but dwell 30s < 60s");
    }

    /// <summary>With both a witness and an elapsed dwell, the gate opens.</summary>
    [Fact]
    public void AfterRecovery_Witness_And_DwellElapsed_Met()
    {
        var clock = new FakeTimeProvider();
        var mgr = new RecoveryStateManager(dwellTime: TimeSpan.FromSeconds(60), timeProvider: clock);

        mgr.MarkFaulted();
        mgr.MarkRecovered();
        mgr.RecordPublishWitness();
        clock.Utc = T0.AddSeconds(61);

        mgr.IsDwellMet().ShouldBeTrue();
    }

    /// <summary>
    /// A re-fault must reset the publish witness AND restart the dwell clock. Each reset is
    /// checked in its own recovery cycle with the other gate satisfied, so a regression in
    /// either one cannot be masked by the other.
    /// </summary>
    [Fact]
    public void ReFault_ResetsWitness_AndDwellClock()
    {
        var clock = new FakeTimeProvider();
        var mgr = new RecoveryStateManager(dwellTime: TimeSpan.FromSeconds(60), timeProvider: clock);

        // Cycle 1 (recovered at T0): witness + 61s dwell → completes normally.
        mgr.MarkFaulted();
        mgr.MarkRecovered();
        mgr.RecordPublishWitness();
        clock.Utc = T0.AddSeconds(61);
        mgr.IsDwellMet().ShouldBeTrue();

        // Cycle 2 (recovered at T0+61): NO witness. Advance a full dwell period past the new
        // recovery so the dwell gate is satisfied whichever recovery it is measured from;
        // only a witness left over from cycle 1 could make this return true.
        mgr.MarkFaulted();
        mgr.MarkRecovered();
        clock.Utc = T0.AddSeconds(61 + 61);
        mgr.IsDwellMet().ShouldBeFalse("new recovery needs its own witness");

        // Cycle 3 (recovered at T0+122): fresh witness, but only 30s elapsed. The witness gate
        // is satisfied, so only a dwell clock still anchored to an earlier recovery could
        // make this return true.
        mgr.MarkFaulted();
        mgr.MarkRecovered();
        mgr.RecordPublishWitness();
        clock.Utc = T0.AddSeconds(122 + 30);
        mgr.IsDwellMet().ShouldBeFalse("new recovery restarts the dwell clock");

        // Sanity: once the restarted dwell elapses the cycle completes, so the two negative
        // assertions above are not passing merely because a re-faulted manager is stuck.
        clock.Utc = T0.AddSeconds(122 + 61);
        mgr.IsDwellMet().ShouldBeTrue();
    }
}
|
||||
@@ -0,0 +1,217 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
|
||||
using ZB.MOM.WW.OtOpcUa.Server.Redundancy;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Server.Tests;
|
||||
|
||||
/// <summary>
/// Unit tests for <see cref="ServiceLevelCalculator"/>: one fact per ServiceLevel band
/// produced by <c>Compute</c>, plus a theory checking that <c>Classify</c> maps each
/// byte value back to its <see cref="ServiceLevelBand"/> label.
/// </summary>
[Trait("Category", "Unit")]
public sealed class ServiceLevelCalculatorTests
{
    /// <summary>
    /// Calls <c>ServiceLevelCalculator.Compute</c> starting from a fully healthy baseline
    /// (self and peer healthy, no apply running, dwell met, topology valid); each test
    /// overrides only the inputs it is about. <c>operatorMaintenance</c> is never passed,
    /// so the calculator's own default for it applies.
    /// </summary>
    private static byte LevelFor(
        RedundancyRole role,
        bool selfHealthy = true,
        bool peerUaHealthy = true,
        bool peerHttpHealthy = true,
        bool applyInProgress = false,
        bool recoveryDwellMet = true,
        bool topologyValid = true)
    {
        return ServiceLevelCalculator.Compute(
            role,
            selfHealthy: selfHealthy, peerUaHealthy: peerUaHealthy, peerHttpHealthy: peerHttpHealthy,
            applyInProgress: applyInProgress, recoveryDwellMet: recoveryDwellMet, topologyValid: topologyValid);
    }

    // --- Reserved bands (0, 1, 2) ---

    [Fact]
    public void OperatorMaintenance_Overrides_Everything()
    {
        // Everything else is healthy, yet the maintenance flag alone decides the band.
        var level = ServiceLevelCalculator.Compute(
            RedundancyRole.Primary,
            selfHealthy: true, peerUaHealthy: true, peerHttpHealthy: true,
            applyInProgress: false, recoveryDwellMet: true, topologyValid: true,
            operatorMaintenance: true);

        level.ShouldBe((byte)ServiceLevelBand.Maintenance);
    }

    [Fact]
    public void UnhealthySelf_ReturnsNoData() =>
        LevelFor(RedundancyRole.Primary, selfHealthy: false)
            .ShouldBe((byte)ServiceLevelBand.NoData);

    [Fact]
    public void InvalidTopology_Demotes_BothNodes_To_2()
    {
        var primaryLevel = LevelFor(RedundancyRole.Primary, topologyValid: false);
        var secondaryLevel = LevelFor(RedundancyRole.Secondary, topologyValid: false);

        primaryLevel.ShouldBe((byte)ServiceLevelBand.InvalidTopology);
        secondaryLevel.ShouldBe((byte)ServiceLevelBand.InvalidTopology);
    }

    // --- Operational bands (authoritative) ---

    [Fact]
    public void Authoritative_Primary_Is_255()
    {
        var level = LevelFor(RedundancyRole.Primary);

        // Checked against both the enum label and the literal so drift in either is caught.
        level.ShouldBe((byte)ServiceLevelBand.AuthoritativePrimary);
        level.ShouldBe((byte)255);
    }

    [Fact]
    public void Authoritative_Backup_Is_100() =>
        LevelFor(RedundancyRole.Secondary).ShouldBe((byte)100);

    // --- Isolated bands ---

    [Fact]
    public void IsolatedPrimary_PeerUnreachable_Is_230_RetainsAuthority() =>
        LevelFor(RedundancyRole.Primary, peerUaHealthy: false)
            .ShouldBe((byte)230);

    [Fact]
    public void IsolatedBackup_PrimaryUnreachable_Is_80_DoesNotPromote() =>
        LevelFor(RedundancyRole.Secondary, peerUaHealthy: false, peerHttpHealthy: false)
            .ShouldBe((byte)80, "Backup isolates at 80 — doesn't auto-promote to 255");

    [Fact]
    public void HttpOnly_Unreachable_TriggersIsolated()
    {
        // A single failed probe is enough to treat the peer as unreachable: the UA probe is
        // the authoritative one, the HTTP probe is the fast-fail short-circuit, and losing
        // either means "not a valid peer right now".
        var level = LevelFor(RedundancyRole.Primary, peerHttpHealthy: false);

        level.ShouldBe((byte)230);
    }

    // --- Apply-mid bands ---

    [Fact]
    public void PrimaryMidApply_Is_200() =>
        LevelFor(RedundancyRole.Primary, applyInProgress: true)
            .ShouldBe((byte)200);

    [Fact]
    public void BackupMidApply_Is_50() =>
        LevelFor(RedundancyRole.Secondary, applyInProgress: true)
            .ShouldBe((byte)50);

    [Fact]
    public void ApplyInProgress_Dominates_PeerUnreachable()
    {
        // Stream C.4 integration-test expectation: when an apply is running AND the peer is
        // down, the mid-apply band (200) wins over the isolated band.
        var level = LevelFor(
            RedundancyRole.Primary,
            peerUaHealthy: false,
            peerHttpHealthy: false,
            applyInProgress: true);

        level.ShouldBe((byte)200);
    }

    // --- Recovering bands ---

    [Fact]
    public void RecoveringPrimary_Is_180() =>
        LevelFor(RedundancyRole.Primary, recoveryDwellMet: false)
            .ShouldBe((byte)180);

    [Fact]
    public void RecoveringBackup_Is_30() =>
        LevelFor(RedundancyRole.Secondary, recoveryDwellMet: false)
            .ShouldBe((byte)30);

    // --- Standalone node (no peer) ---

    [Fact]
    public void Standalone_IsAuthoritativePrimary_WhenHealthy() =>
        LevelFor(RedundancyRole.Standalone, peerUaHealthy: false, peerHttpHealthy: false)
            .ShouldBe((byte)255, "Standalone has no peer — treat healthy as authoritative");

    [Fact]
    public void Standalone_MidApply_Is_200() =>
        LevelFor(
                RedundancyRole.Standalone,
                peerUaHealthy: false,
                peerHttpHealthy: false,
                applyInProgress: true)
            .ShouldBe((byte)200);

    // --- Classify round-trip ---

    [Theory]
    [InlineData((byte)0, ServiceLevelBand.Maintenance)]
    [InlineData((byte)1, ServiceLevelBand.NoData)]
    [InlineData((byte)2, ServiceLevelBand.InvalidTopology)]
    [InlineData((byte)30, ServiceLevelBand.RecoveringBackup)]
    [InlineData((byte)50, ServiceLevelBand.BackupMidApply)]
    [InlineData((byte)80, ServiceLevelBand.IsolatedBackup)]
    [InlineData((byte)100, ServiceLevelBand.AuthoritativeBackup)]
    [InlineData((byte)180, ServiceLevelBand.RecoveringPrimary)]
    [InlineData((byte)200, ServiceLevelBand.PrimaryMidApply)]
    [InlineData((byte)230, ServiceLevelBand.IsolatedPrimary)]
    [InlineData((byte)255, ServiceLevelBand.AuthoritativePrimary)]
    [InlineData((byte)123, ServiceLevelBand.Unknown)] // a value outside the table falls back to Unknown
    public void Classify_RoundTrips_EveryBand(byte value, ServiceLevelBand expected)
    {
        var band = ServiceLevelCalculator.Classify(value);

        band.ShouldBe(expected);
    }
}
|
||||
Reference in New Issue
Block a user