From 483f55557c167b6f824a0765a82e4483b876f838 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sun, 19 Apr 2026 09:56:34 -0400 Subject: [PATCH] =?UTF-8?q?Phase=206.3=20Stream=20B=20+=20Stream=20D=20(co?= =?UTF-8?q?re)=20=E2=80=94=20ServiceLevelCalculator=20+=20RecoveryStateMan?= =?UTF-8?q?ager=20+=20ApplyLeaseRegistry?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lands the pure-logic heart of Phase 6.3. OPC UA node wiring (Stream C), RedundancyCoordinator topology loader (Stream A), Admin UI + metrics (Stream E), and client interop tests (Stream F) are follow-up work — tracked as tasks #145-150. New Server.Redundancy sub-namespace: - ServiceLevelCalculator — pure 8-state matrix per decision #154. Inputs: role, selfHealthy, peerUa/HttpHealthy, applyInProgress, recoveryDwellMet, topologyValid, operatorMaintenance. Output: OPC UA Part 5 §6.3.34 Byte. Reserved bands (0=Maintenance, 1=NoData, 2=InvalidTopology) override everything; operational bands occupy 30..255. Key invariants: * Authoritative-Primary = 255, Authoritative-Backup = 100. * Isolated-Primary = 230 (retains authority with peer down). * Isolated-Backup = 80 (does NOT auto-promote — non-transparent model). * Primary-Mid-Apply = 200, Backup-Mid-Apply = 50; apply dominates peer-unreachable per Stream C.4 integration expectation. * Recovering-Primary = 180, Recovering-Backup = 30. * Standalone treats healthy as Authoritative-Primary (no peer concept). - ServiceLevelBand enum — labels every numeric band for logs + Admin UI. Values match the calculator table exactly; compliance script asserts drift detection. - RecoveryStateManager — holds Recovering band until (dwell ≥ 60s default) AND (one publish witness observed). Re-fault resets both gates so a flapping node doesn't shortcut through recovery twice. - ApplyLeaseRegistry — keyed on (ConfigGenerationId, PublishRequestId) per decision #162. 
BeginApplyLease returns an IAsyncDisposable so every exit path (success, exception, cancellation, dispose-twice) closes the lease. ApplyMaxDuration watchdog (10 min default) via PruneStale tick forces close after a crashed publisher so ServiceLevel can't stick at mid-apply. Tests (40 new, all pass): - ServiceLevelCalculatorTests (27): reserved bands override; self-unhealthy → NoData; invalid topology demotes both nodes to 2; authoritative primary 255; backup 100; isolated primary 230 retains authority; isolated backup 80 does not promote; http-only unreachable triggers isolated; mid-apply primary 200; mid-apply backup 50; apply dominates peer-unreachable; recovering primary 180; recovering backup 30; standalone treats healthy as 255; classify round-trips every band including Unknown sentinel. - RecoveryStateManagerTests (6): never-faulted auto-meets dwell; faulted-only returns true (semantics-doc test — coordinator short-circuits on selfHealthy=false); recovered without witness never meets; witness without dwell never meets; witness + dwell-elapsed meets; re-fault resets. - ApplyLeaseRegistryTests (7): empty registry not-in-progress; begin+dispose closes; dispose on exception still closes; dispose twice safe; concurrent leases isolated; watchdog closes stale; watchdog leaves recent alone. Full solution dotnet test: 1137 passing (Phase 6.2 shipped at 1097, Phase 6.3 B + D core = +40 = 1137). Pre-existing Client.CLI Subscribe flake unchanged. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Redundancy/ApplyLeaseRegistry.cs | 85 +++++++ .../Redundancy/RecoveryStateManager.cs | 65 ++++++ .../Redundancy/ServiceLevelCalculator.cs | 131 +++++++++++ .../ApplyLeaseRegistryTests.cs | 118 ++++++++++ .../RecoveryStateManagerTests.cs | 92 ++++++++ .../ServiceLevelCalculatorTests.cs | 217 ++++++++++++++++++ 6 files changed, 708 insertions(+) create mode 100644 src/ZB.MOM.WW.OtOpcUa.Server/Redundancy/ApplyLeaseRegistry.cs create mode 100644 src/ZB.MOM.WW.OtOpcUa.Server/Redundancy/RecoveryStateManager.cs create mode 100644 src/ZB.MOM.WW.OtOpcUa.Server/Redundancy/ServiceLevelCalculator.cs create mode 100644 tests/ZB.MOM.WW.OtOpcUa.Server.Tests/ApplyLeaseRegistryTests.cs create mode 100644 tests/ZB.MOM.WW.OtOpcUa.Server.Tests/RecoveryStateManagerTests.cs create mode 100644 tests/ZB.MOM.WW.OtOpcUa.Server.Tests/ServiceLevelCalculatorTests.cs diff --git a/src/ZB.MOM.WW.OtOpcUa.Server/Redundancy/ApplyLeaseRegistry.cs b/src/ZB.MOM.WW.OtOpcUa.Server/Redundancy/ApplyLeaseRegistry.cs new file mode 100644 index 0000000..4a8a7df --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Server/Redundancy/ApplyLeaseRegistry.cs @@ -0,0 +1,85 @@ +using System.Collections.Concurrent; + +namespace ZB.MOM.WW.OtOpcUa.Server.Redundancy; + +/// +/// Tracks in-progress publish-generation apply leases keyed on +/// (ConfigGenerationId, PublishRequestId). Per decision #162 a sealed lease pattern +/// ensures <see cref="IsApplyInProgress"/> reflects every exit path (success / exception / +/// cancellation) because the IAsyncDisposable returned by <see cref="BeginApplyLease"/> +/// decrements unconditionally. +/// +/// +/// A watchdog loop calls <see cref="PruneStale"/> periodically with the configured +/// <see cref="ApplyMaxDuration"/>; any lease older than that is force-closed so a crashed +/// publisher can't pin the node at a mid-apply ServiceLevel band. +/// +public sealed class ApplyLeaseRegistry +{ + private readonly ConcurrentDictionary _leases = new(); + private readonly TimeProvider _timeProvider; + + public TimeSpan ApplyMaxDuration { get; } + + public ApplyLeaseRegistry(TimeSpan? 
applyMaxDuration = null, TimeProvider? timeProvider = null) + { + ApplyMaxDuration = applyMaxDuration ?? TimeSpan.FromMinutes(10); + _timeProvider = timeProvider ?? TimeProvider.System; + } + + /// + /// Register a new lease. Returns an <see cref="IAsyncDisposable"/> whose disposal + /// decrements the registry; use await using in the caller so every exit path + /// closes the lease. + /// + public IAsyncDisposable BeginApplyLease(long generationId, Guid publishRequestId) + { + var key = new LeaseKey(generationId, publishRequestId); + _leases[key] = _timeProvider.GetUtcNow().UtcDateTime; + return new LeaseScope(this, key); + } + + /// True when at least one apply lease is currently open. + public bool IsApplyInProgress => !_leases.IsEmpty; + + /// Current open-lease count — diagnostics only. + public int OpenLeaseCount => _leases.Count; + + /// Force-close any lease older than <see cref="ApplyMaxDuration"/>. Watchdog tick. + /// Number of leases the watchdog closed on this tick. + public int PruneStale() + { + var now = _timeProvider.GetUtcNow().UtcDateTime; + var closed = 0; + foreach (var kv in _leases) + { + if (now - kv.Value > ApplyMaxDuration && _leases.TryRemove(kv.Key, out _)) + closed++; + } + return closed; + } + + private void Release(LeaseKey key) => _leases.TryRemove(key, out _); + + private readonly record struct LeaseKey(long GenerationId, Guid PublishRequestId); + + private sealed class LeaseScope : IAsyncDisposable + { + private readonly ApplyLeaseRegistry _owner; + private readonly LeaseKey _key; + private int _disposed; + + public LeaseScope(ApplyLeaseRegistry owner, LeaseKey key) + { + _owner = owner; + _key = key; + } + + public ValueTask DisposeAsync() + { + if (Interlocked.Exchange(ref _disposed, 1) == 0) + _owner.Release(_key); + return ValueTask.CompletedTask; + } + } +} diff --git a/src/ZB.MOM.WW.OtOpcUa.Server/Redundancy/RecoveryStateManager.cs b/src/ZB.MOM.WW.OtOpcUa.Server/Redundancy/RecoveryStateManager.cs new file mode 100644 index 0000000..27a5797 --- /dev/null +++ 
b/src/ZB.MOM.WW.OtOpcUa.Server/Redundancy/RecoveryStateManager.cs @@ -0,0 +1,65 @@ +namespace ZB.MOM.WW.OtOpcUa.Server.Redundancy; + +/// +/// Tracks the Recovering-band dwell for a node after a Faulted → Healthy transition. +/// Per decision #154 and Phase 6.3 Stream B.4 a node that has just returned to health stays +/// in the Recovering band (180 Primary / 30 Backup) until BOTH: (a) the configured +/// <see cref="DwellTime"/> has elapsed, AND (b) at least one successful publish-witness +/// read has been observed. +/// +/// +/// Purely in-memory, no I/O. The coordinator feeds events into <see cref="MarkFaulted"/>, +/// <see cref="MarkRecovered"/>, and <see cref="RecordPublishWitness"/>; +/// <see cref="IsDwellMet"/> becomes true only after both conditions converge. +/// +public sealed class RecoveryStateManager +{ + private readonly TimeSpan _dwellTime; + private readonly TimeProvider _timeProvider; + + /// Last time the node transitioned Faulted → Healthy. Null until first recovery and again after each fault. + private DateTime? _recoveredUtc; + + /// True once a publish-witness read has succeeded after the last recovery. + private bool _witnessed; + + public TimeSpan DwellTime => _dwellTime; + + public RecoveryStateManager(TimeSpan? dwellTime = null, TimeProvider? timeProvider = null) + { + _dwellTime = dwellTime ?? TimeSpan.FromSeconds(60); + _timeProvider = timeProvider ?? TimeProvider.System; + } + + /// Report that the node has entered the Faulted state. + public void MarkFaulted() + { + _recoveredUtc = null; + _witnessed = false; + } + + /// Report that the node has transitioned Faulted → Healthy; dwell clock starts now. + public void MarkRecovered() + { + _recoveredUtc = _timeProvider.GetUtcNow().UtcDateTime; + _witnessed = false; + } + + /// Report a successful publish-witness read. + public void RecordPublishWitness() => _witnessed = true; + + /// + /// True when the dwell is considered met: either no recovery is being tracked (never faulted, + /// or faulted but not yet recovered), or both (dwell time elapsed + publish witness recorded) + /// since the last recovery. False means the coordinator should report Recovering-band ServiceLevel. 
+ /// + public bool IsDwellMet() + { + if (_recoveredUtc is null) return true; // no recovery tracked (never faulted, or faulted and not yet recovered) → dwell N/A + + if (!_witnessed) return false; + + var elapsed = _timeProvider.GetUtcNow().UtcDateTime - _recoveredUtc.Value; + return elapsed >= _dwellTime; + } +} diff --git a/src/ZB.MOM.WW.OtOpcUa.Server/Redundancy/ServiceLevelCalculator.cs b/src/ZB.MOM.WW.OtOpcUa.Server/Redundancy/ServiceLevelCalculator.cs new file mode 100644 index 0000000..75678b4 --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Server/Redundancy/ServiceLevelCalculator.cs @@ -0,0 +1,131 @@ +using ZB.MOM.WW.OtOpcUa.Configuration.Enums; + +namespace ZB.MOM.WW.OtOpcUa.Server.Redundancy; + +/// +/// Pure-function translator from the redundancy-state inputs (role, self health, peer +/// reachability via HTTP + UA probes, apply-in-progress flag, recovery dwell, topology +/// validity) to the OPC UA Part 5 §6.3.34 ServiceLevel value. +/// +/// +/// Per decision #154 the 8-state matrix avoids the reserved bands (0=Maintenance, +/// 1=NoData) for operational states. InvalidTopology reports 2 and operational bands occupy +/// 30..255, so a spec-compliant client that cuts over on "<3 = unhealthy" keeps working +/// without its vendor treating the server as "under maintenance" during normal runtime. +/// +/// This class is pure — no threads, no I/O. The coordinator that owns it re-evaluates +/// on every input change and pushes the new byte through an IObserver<byte> to +/// the OPC UA ServiceLevel variable. Tests exercise the full matrix without touching a UA +/// stack. +/// +public static class ServiceLevelCalculator +{ + /// Compute the ServiceLevel for the given inputs. + /// Role declared for this node in the shared config DB. + /// This node's own health (from Phase 6.1 /healthz). + /// Peer node reachable via OPC UA probe. + /// Peer node reachable via HTTP /healthz probe. + /// True while this node is inside a publish-generation apply window. + /// True once the post-fault dwell + publish-witness conditions are met. 
+ /// False when the cluster has detected >1 Primary (InvalidTopology demotes both nodes). + /// True when operator has declared the node in maintenance. + public static byte Compute( + RedundancyRole role, + bool selfHealthy, + bool peerUaHealthy, + bool peerHttpHealthy, + bool applyInProgress, + bool recoveryDwellMet, + bool topologyValid, + bool operatorMaintenance = false) + { + // Reserved bands first — they override everything per OPC UA Part 5 §6.3.34. + if (operatorMaintenance) return (byte)ServiceLevelBand.Maintenance; // 0 + if (!selfHealthy) return (byte)ServiceLevelBand.NoData; // 1 + if (!topologyValid) return (byte)ServiceLevelBand.InvalidTopology; // 2 + + // Standalone nodes have no peer — treat as authoritative when healthy. + if (role == RedundancyRole.Standalone) + return (byte)(applyInProgress ? ServiceLevelBand.PrimaryMidApply : ServiceLevelBand.AuthoritativePrimary); + + var isPrimary = role == RedundancyRole.Primary; + + // Apply-in-progress band dominates recovery + isolation (client should cut to peer). + if (applyInProgress) + return (byte)(isPrimary ? ServiceLevelBand.PrimaryMidApply : ServiceLevelBand.BackupMidApply); + + // Post-fault recovering — hold until dwell + witness satisfied. + if (!recoveryDwellMet) + return (byte)(isPrimary ? ServiceLevelBand.RecoveringPrimary : ServiceLevelBand.RecoveringBackup); + + // Peer unreachable (either probe fails) → isolated band. Per decision #154 Primary + // retains authority at 230 when isolated; Backup signals 80 "take over if asked" and + // does NOT auto-promote (non-transparent model). + var peerReachable = peerUaHealthy && peerHttpHealthy; + if (!peerReachable) + return (byte)(isPrimary ? ServiceLevelBand.IsolatedPrimary : ServiceLevelBand.IsolatedBackup); + + return (byte)(isPrimary ? ServiceLevelBand.AuthoritativePrimary : ServiceLevelBand.AuthoritativeBackup); + } + + /// Labels a ServiceLevel byte with its matrix band name — for logs + Admin UI. 
+ public static ServiceLevelBand Classify(byte value) => value switch + { + (byte)ServiceLevelBand.Maintenance => ServiceLevelBand.Maintenance, + (byte)ServiceLevelBand.NoData => ServiceLevelBand.NoData, + (byte)ServiceLevelBand.InvalidTopology => ServiceLevelBand.InvalidTopology, + (byte)ServiceLevelBand.RecoveringBackup => ServiceLevelBand.RecoveringBackup, + (byte)ServiceLevelBand.BackupMidApply => ServiceLevelBand.BackupMidApply, + (byte)ServiceLevelBand.IsolatedBackup => ServiceLevelBand.IsolatedBackup, + (byte)ServiceLevelBand.AuthoritativeBackup => ServiceLevelBand.AuthoritativeBackup, + (byte)ServiceLevelBand.RecoveringPrimary => ServiceLevelBand.RecoveringPrimary, + (byte)ServiceLevelBand.PrimaryMidApply => ServiceLevelBand.PrimaryMidApply, + (byte)ServiceLevelBand.IsolatedPrimary => ServiceLevelBand.IsolatedPrimary, + (byte)ServiceLevelBand.AuthoritativePrimary => ServiceLevelBand.AuthoritativePrimary, + _ => ServiceLevelBand.Unknown, + }; +} + +/// +/// Named bands of the 8-state ServiceLevel matrix. Numeric values match the +/// <see cref="ServiceLevelCalculator"/> table exactly; any drift will be caught by the +/// Phase 6.3 compliance script. +/// +public enum ServiceLevelBand : byte +{ + /// Operator-declared maintenance. Reserved per OPC UA Part 5 §6.3.34. + Maintenance = 0, + + /// Unreachable / Faulted. Reserved per OPC UA Part 5 §6.3.34. + NoData = 1, + + /// Detected-inconsistency band — >1 Primary observed at runtime; both nodes self-demote. + InvalidTopology = 2, + + /// Backup post-fault, dwell not met. + RecoveringBackup = 30, + + /// Backup inside a publish-apply window. + BackupMidApply = 50, + + /// Backup with unreachable Primary — "take over if asked"; does NOT auto-promote. + IsolatedBackup = 80, + + /// Backup nominal operation. + AuthoritativeBackup = 100, + + /// Primary post-fault, dwell not met. + RecoveringPrimary = 180, + + /// Primary inside a publish-apply window. + PrimaryMidApply = 200, + + /// Primary with unreachable peer, self serving — retains authority. 
+ IsolatedPrimary = 230, + + /// Primary nominal operation. + AuthoritativePrimary = 255, + + /// Sentinel for unrecognised byte values. + Unknown = 254, +} diff --git a/tests/ZB.MOM.WW.OtOpcUa.Server.Tests/ApplyLeaseRegistryTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Server.Tests/ApplyLeaseRegistryTests.cs new file mode 100644 index 0000000..81d70b7 --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Server.Tests/ApplyLeaseRegistryTests.cs @@ -0,0 +1,118 @@ +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Server.Redundancy; + +namespace ZB.MOM.WW.OtOpcUa.Server.Tests; + +[Trait("Category", "Unit")] +public sealed class ApplyLeaseRegistryTests +{ + private static readonly DateTime T0 = new(2026, 4, 19, 12, 0, 0, DateTimeKind.Utc); + + private sealed class FakeTimeProvider : TimeProvider + { + public DateTime Utc { get; set; } = T0; + public override DateTimeOffset GetUtcNow() => new(Utc, TimeSpan.Zero); + } + + [Fact] + public async Task EmptyRegistry_NotInProgress() + { + var reg = new ApplyLeaseRegistry(); + reg.IsApplyInProgress.ShouldBeFalse(); + await Task.Yield(); + } + + [Fact] + public async Task BeginAndDispose_ClosesLease() + { + var reg = new ApplyLeaseRegistry(); + + await using (reg.BeginApplyLease(1, Guid.NewGuid())) + { + reg.IsApplyInProgress.ShouldBeTrue(); + reg.OpenLeaseCount.ShouldBe(1); + } + + reg.IsApplyInProgress.ShouldBeFalse(); + } + + [Fact] + public async Task Dispose_OnException_StillCloses() + { + var reg = new ApplyLeaseRegistry(); + var publishId = Guid.NewGuid(); + + await Should.ThrowAsync(async () => + { + await using var lease = reg.BeginApplyLease(1, publishId); + throw new InvalidOperationException("publish failed"); + }); + + reg.IsApplyInProgress.ShouldBeFalse("await-using semantics must close the lease on exception"); + } + + [Fact] + public async Task Dispose_TwiceIsSafe() + { + var reg = new ApplyLeaseRegistry(); + var lease = reg.BeginApplyLease(1, Guid.NewGuid()); + + await lease.DisposeAsync(); + await lease.DisposeAsync(); + + 
reg.IsApplyInProgress.ShouldBeFalse(); + } + + [Fact] + public async Task MultipleLeases_Concurrent_StayIsolated() + { + var reg = new ApplyLeaseRegistry(); + var id1 = Guid.NewGuid(); + var id2 = Guid.NewGuid(); + + await using var lease1 = reg.BeginApplyLease(1, id1); + await using var lease2 = reg.BeginApplyLease(2, id2); + + reg.OpenLeaseCount.ShouldBe(2); + await lease1.DisposeAsync(); + reg.IsApplyInProgress.ShouldBeTrue("lease2 still open"); + await lease2.DisposeAsync(); + reg.IsApplyInProgress.ShouldBeFalse(); + } + + [Fact] + public async Task Watchdog_ClosesStaleLeases() + { + var clock = new FakeTimeProvider(); + var reg = new ApplyLeaseRegistry(applyMaxDuration: TimeSpan.FromMinutes(10), timeProvider: clock); + + _ = reg.BeginApplyLease(1, Guid.NewGuid()); // intentional leak; not awaited / disposed + + // Lease still young → no-op. + clock.Utc = T0.AddMinutes(5); + reg.PruneStale().ShouldBe(0); + reg.IsApplyInProgress.ShouldBeTrue(); + + // Past the watchdog horizon → force-close. 
+ clock.Utc = T0.AddMinutes(11); + var closed = reg.PruneStale(); + + closed.ShouldBe(1); + reg.IsApplyInProgress.ShouldBeFalse("ServiceLevel can't stick at mid-apply after a crashed publisher"); + await Task.Yield(); + } + + [Fact] + public async Task Watchdog_LeavesRecentLeaseAlone() + { + var clock = new FakeTimeProvider(); + var reg = new ApplyLeaseRegistry(applyMaxDuration: TimeSpan.FromMinutes(10), timeProvider: clock); + + await using var lease = reg.BeginApplyLease(1, Guid.NewGuid()); + clock.Utc = T0.AddMinutes(3); + + reg.PruneStale().ShouldBe(0); + reg.IsApplyInProgress.ShouldBeTrue(); + } +} diff --git a/tests/ZB.MOM.WW.OtOpcUa.Server.Tests/RecoveryStateManagerTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Server.Tests/RecoveryStateManagerTests.cs new file mode 100644 index 0000000..ce90d8b --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Server.Tests/RecoveryStateManagerTests.cs @@ -0,0 +1,92 @@ +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Server.Redundancy; + +namespace ZB.MOM.WW.OtOpcUa.Server.Tests; + +[Trait("Category", "Unit")] +public sealed class RecoveryStateManagerTests +{ + private static readonly DateTime T0 = new(2026, 4, 19, 12, 0, 0, DateTimeKind.Utc); + + private sealed class FakeTimeProvider : TimeProvider + { + public DateTime Utc { get; set; } = T0; + public override DateTimeOffset GetUtcNow() => new(Utc, TimeSpan.Zero); + } + + [Fact] + public void NeverFaulted_DwellIsAutomaticallyMet() + { + var mgr = new RecoveryStateManager(); + mgr.IsDwellMet().ShouldBeTrue(); + } + + [Fact] + public void AfterFault_Only_IsDwellMet_Returns_True_ButCallerDoesntQueryDuringFaulted() + { + // Documented semantics: IsDwellMet is only consulted when selfHealthy=true (i.e. the + // node has recovered into Healthy). During Faulted the coordinator short-circuits on + // the self-health check and never calls IsDwellMet. 
So returning true here is harmless; + // the test captures the intent so a future "return false during Faulted" tweak has to + // deliberately change this test first. + var mgr = new RecoveryStateManager(); + mgr.MarkFaulted(); + mgr.IsDwellMet().ShouldBeTrue(); + } + + [Fact] + public void AfterRecovery_NoWitness_DwellNotMet_EvenAfterElapsed() + { + var clock = new FakeTimeProvider(); + var mgr = new RecoveryStateManager(dwellTime: TimeSpan.FromSeconds(60), timeProvider: clock); + mgr.MarkFaulted(); + mgr.MarkRecovered(); + clock.Utc = T0.AddSeconds(120); + + mgr.IsDwellMet().ShouldBeFalse("dwell elapsed but no publish witness — must NOT escape Recovering band"); + } + + [Fact] + public void AfterRecovery_WitnessButTooSoon_DwellNotMet() + { + var clock = new FakeTimeProvider(); + var mgr = new RecoveryStateManager(dwellTime: TimeSpan.FromSeconds(60), timeProvider: clock); + mgr.MarkFaulted(); + mgr.MarkRecovered(); + mgr.RecordPublishWitness(); + clock.Utc = T0.AddSeconds(30); + + mgr.IsDwellMet().ShouldBeFalse("witness ok but dwell 30s < 60s"); + } + + [Fact] + public void AfterRecovery_Witness_And_DwellElapsed_Met() + { + var clock = new FakeTimeProvider(); + var mgr = new RecoveryStateManager(dwellTime: TimeSpan.FromSeconds(60), timeProvider: clock); + mgr.MarkFaulted(); + mgr.MarkRecovered(); + mgr.RecordPublishWitness(); + clock.Utc = T0.AddSeconds(61); + + mgr.IsDwellMet().ShouldBeTrue(); + } + + [Fact] + public void ReFault_ResetsWitness_AndDwellClock() + { + var clock = new FakeTimeProvider(); + var mgr = new RecoveryStateManager(dwellTime: TimeSpan.FromSeconds(60), timeProvider: clock); + mgr.MarkFaulted(); + mgr.MarkRecovered(); + mgr.RecordPublishWitness(); + clock.Utc = T0.AddSeconds(61); + mgr.IsDwellMet().ShouldBeTrue(); + + mgr.MarkFaulted(); + mgr.MarkRecovered(); + clock.Utc = T0.AddSeconds(100); // re-entered Recovering, no new witness + mgr.IsDwellMet().ShouldBeFalse("new recovery needs its own witness"); + } +} diff --git 
a/tests/ZB.MOM.WW.OtOpcUa.Server.Tests/ServiceLevelCalculatorTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Server.Tests/ServiceLevelCalculatorTests.cs new file mode 100644 index 0000000..34a5d21 --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Server.Tests/ServiceLevelCalculatorTests.cs @@ -0,0 +1,217 @@ +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Configuration.Enums; +using ZB.MOM.WW.OtOpcUa.Server.Redundancy; + +namespace ZB.MOM.WW.OtOpcUa.Server.Tests; + +[Trait("Category", "Unit")] +public sealed class ServiceLevelCalculatorTests +{ + // --- Reserved bands (0, 1, 2) --- + + [Fact] + public void OperatorMaintenance_Overrides_Everything() + { + var v = ServiceLevelCalculator.Compute( + RedundancyRole.Primary, + selfHealthy: true, peerUaHealthy: true, peerHttpHealthy: true, + applyInProgress: false, recoveryDwellMet: true, topologyValid: true, + operatorMaintenance: true); + + v.ShouldBe((byte)ServiceLevelBand.Maintenance); + } + + [Fact] + public void UnhealthySelf_ReturnsNoData() + { + var v = ServiceLevelCalculator.Compute( + RedundancyRole.Primary, + selfHealthy: false, peerUaHealthy: true, peerHttpHealthy: true, + applyInProgress: false, recoveryDwellMet: true, topologyValid: true); + + v.ShouldBe((byte)ServiceLevelBand.NoData); + } + + [Fact] + public void InvalidTopology_Demotes_BothNodes_To_2() + { + var primary = ServiceLevelCalculator.Compute( + RedundancyRole.Primary, + selfHealthy: true, peerUaHealthy: true, peerHttpHealthy: true, + applyInProgress: false, recoveryDwellMet: true, topologyValid: false); + var secondary = ServiceLevelCalculator.Compute( + RedundancyRole.Secondary, + selfHealthy: true, peerUaHealthy: true, peerHttpHealthy: true, + applyInProgress: false, recoveryDwellMet: true, topologyValid: false); + + primary.ShouldBe((byte)ServiceLevelBand.InvalidTopology); + secondary.ShouldBe((byte)ServiceLevelBand.InvalidTopology); + } + + // --- Operational bands (authoritative) --- + + [Fact] + public void Authoritative_Primary_Is_255() + { + var v = 
ServiceLevelCalculator.Compute( + RedundancyRole.Primary, + selfHealthy: true, peerUaHealthy: true, peerHttpHealthy: true, + applyInProgress: false, recoveryDwellMet: true, topologyValid: true); + + v.ShouldBe((byte)ServiceLevelBand.AuthoritativePrimary); + v.ShouldBe((byte)255); + } + + [Fact] + public void Authoritative_Backup_Is_100() + { + var v = ServiceLevelCalculator.Compute( + RedundancyRole.Secondary, + selfHealthy: true, peerUaHealthy: true, peerHttpHealthy: true, + applyInProgress: false, recoveryDwellMet: true, topologyValid: true); + + v.ShouldBe((byte)100); + } + + // --- Isolated bands --- + + [Fact] + public void IsolatedPrimary_PeerUnreachable_Is_230_RetainsAuthority() + { + var v = ServiceLevelCalculator.Compute( + RedundancyRole.Primary, + selfHealthy: true, peerUaHealthy: false, peerHttpHealthy: true, + applyInProgress: false, recoveryDwellMet: true, topologyValid: true); + + v.ShouldBe((byte)230); + } + + [Fact] + public void IsolatedBackup_PrimaryUnreachable_Is_80_DoesNotPromote() + { + var v = ServiceLevelCalculator.Compute( + RedundancyRole.Secondary, + selfHealthy: true, peerUaHealthy: false, peerHttpHealthy: false, + applyInProgress: false, recoveryDwellMet: true, topologyValid: true); + + v.ShouldBe((byte)80, "Backup isolates at 80 — doesn't auto-promote to 255"); + } + + [Fact] + public void HttpOnly_Unreachable_TriggersIsolated() + { + // Either probe failing marks peer unreachable — UA probe is authoritative but HTTP is + // the fast-fail short-circuit; either missing means "not a valid peer right now". 
+ var v = ServiceLevelCalculator.Compute( + RedundancyRole.Primary, + selfHealthy: true, peerUaHealthy: true, peerHttpHealthy: false, + applyInProgress: false, recoveryDwellMet: true, topologyValid: true); + + v.ShouldBe((byte)230); + } + + // --- Apply-mid bands --- + + [Fact] + public void PrimaryMidApply_Is_200() + { + var v = ServiceLevelCalculator.Compute( + RedundancyRole.Primary, + selfHealthy: true, peerUaHealthy: true, peerHttpHealthy: true, + applyInProgress: true, recoveryDwellMet: true, topologyValid: true); + + v.ShouldBe((byte)200); + } + + [Fact] + public void BackupMidApply_Is_50() + { + var v = ServiceLevelCalculator.Compute( + RedundancyRole.Secondary, + selfHealthy: true, peerUaHealthy: true, peerHttpHealthy: true, + applyInProgress: true, recoveryDwellMet: true, topologyValid: true); + + v.ShouldBe((byte)50); + } + + [Fact] + public void ApplyInProgress_Dominates_PeerUnreachable() + { + // Per Stream C.4 integration-test expectation: mid-apply + peer down → apply wins (200). 
+ var v = ServiceLevelCalculator.Compute( + RedundancyRole.Primary, + selfHealthy: true, peerUaHealthy: false, peerHttpHealthy: false, + applyInProgress: true, recoveryDwellMet: true, topologyValid: true); + + v.ShouldBe((byte)200); + } + + // --- Recovering bands --- + + [Fact] + public void RecoveringPrimary_Is_180() + { + var v = ServiceLevelCalculator.Compute( + RedundancyRole.Primary, + selfHealthy: true, peerUaHealthy: true, peerHttpHealthy: true, + applyInProgress: false, recoveryDwellMet: false, topologyValid: true); + + v.ShouldBe((byte)180); + } + + [Fact] + public void RecoveringBackup_Is_30() + { + var v = ServiceLevelCalculator.Compute( + RedundancyRole.Secondary, + selfHealthy: true, peerUaHealthy: true, peerHttpHealthy: true, + applyInProgress: false, recoveryDwellMet: false, topologyValid: true); + + v.ShouldBe((byte)30); + } + + // --- Standalone node (no peer) --- + + [Fact] + public void Standalone_IsAuthoritativePrimary_WhenHealthy() + { + var v = ServiceLevelCalculator.Compute( + RedundancyRole.Standalone, + selfHealthy: true, peerUaHealthy: false, peerHttpHealthy: false, + applyInProgress: false, recoveryDwellMet: true, topologyValid: true); + + v.ShouldBe((byte)255, "Standalone has no peer — treat healthy as authoritative"); + } + + [Fact] + public void Standalone_MidApply_Is_200() + { + var v = ServiceLevelCalculator.Compute( + RedundancyRole.Standalone, + selfHealthy: true, peerUaHealthy: false, peerHttpHealthy: false, + applyInProgress: true, recoveryDwellMet: true, topologyValid: true); + + v.ShouldBe((byte)200); + } + + // --- Classify round-trip --- + + [Theory] + [InlineData((byte)0, ServiceLevelBand.Maintenance)] + [InlineData((byte)1, ServiceLevelBand.NoData)] + [InlineData((byte)2, ServiceLevelBand.InvalidTopology)] + [InlineData((byte)30, ServiceLevelBand.RecoveringBackup)] + [InlineData((byte)50, ServiceLevelBand.BackupMidApply)] + [InlineData((byte)80, ServiceLevelBand.IsolatedBackup)] + [InlineData((byte)100, 
ServiceLevelBand.AuthoritativeBackup)] + [InlineData((byte)180, ServiceLevelBand.RecoveringPrimary)] + [InlineData((byte)200, ServiceLevelBand.PrimaryMidApply)] + [InlineData((byte)230, ServiceLevelBand.IsolatedPrimary)] + [InlineData((byte)255, ServiceLevelBand.AuthoritativePrimary)] + [InlineData((byte)123, ServiceLevelBand.Unknown)] + public void Classify_RoundTrips_EveryBand(byte value, ServiceLevelBand expected) + { + ServiceLevelCalculator.Classify(value).ShouldBe(expected); + } +}