namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor;

/// <summary>
/// Respawn-with-backoff schedule for the FOCAS Host process. Matches Galaxy Tier-C:
/// 5s → 15s → 60s cap. A sustained stable run (default 2 min) resets the index so a
/// one-off crash after hours of steady-state doesn't start from the top of the ladder.
/// </summary>
public sealed class Backoff
{
    /// <summary>Galaxy Tier-C default ladder: 5s → 15s → then clamped at 60s.</summary>
    public static TimeSpan[] DefaultSequence { get; } =
        [TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(15), TimeSpan.FromSeconds(60)];

    /// <summary>How long the Host must stay up before the ladder rewinds to the first rung.</summary>
    public TimeSpan StableRunThreshold { get; init; } = TimeSpan.FromMinutes(2);

    private readonly TimeSpan[] _ladder;
    private int _attempt;

    /// <summary>
    /// Builds a schedule over <paramref name="sequence"/>; null selects the Tier-C default.
    /// </summary>
    public Backoff(TimeSpan[]? sequence = null)
    {
        _ladder = sequence ?? DefaultSequence;
    }

    /// <summary>
    /// Returns the delay for the current attempt and advances the ladder. Attempts past
    /// the end of the sequence keep returning the last (largest) delay.
    /// </summary>
    public TimeSpan Next()
    {
        var rung = _attempt++;
        return rung < _ladder.Length ? _ladder[rung] : _ladder[^1];
    }

    /// <summary>Rewinds the ladder after a sustained stable run.</summary>
    public void RecordStableRun() => _attempt = 0;

    /// <summary>Zero-based count of delays handed out since the last reset; exposed for telemetry.</summary>
    public int AttemptIndex => _attempt;
}
namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor;

/// <summary>
/// Crash-loop circuit breaker for the FOCAS Host. Matches Galaxy Tier-C defaults:
/// 3 crashes within 5 minutes opens the breaker; cooldown escalates 1h → 4h → manual
/// reset. A sticky alert stays live until the operator explicitly clears it so
/// recurring crashes can't silently burn through the cooldown ladder overnight.
/// </summary>
public sealed class CircuitBreaker
{
    /// <summary>Crashes tolerated inside <see cref="Window"/> before the breaker opens.</summary>
    public int CrashesAllowedPerWindow { get; init; } = 3;

    /// <summary>Sliding window over which crashes are counted.</summary>
    public TimeSpan Window { get; init; } = TimeSpan.FromMinutes(5);

    /// <summary>
    /// Cooldown per open/re-open cycle. <see cref="TimeSpan.MaxValue"/> means the breaker
    /// stays open until <see cref="ManualReset"/> is called.
    /// </summary>
    public TimeSpan[] CooldownEscalation { get; init; } =
        [TimeSpan.FromHours(1), TimeSpan.FromHours(4), TimeSpan.MaxValue];

    // Crash timestamps inside the current window (generic argument reconstructed —
    // the type parameter was stripped in transit).
    private readonly List<DateTime> _crashesUtc = [];
    private DateTime? _openSinceUtc;   // non-null while the breaker is open
    private int _escalationLevel;      // index into CooldownEscalation

    /// <summary>Latched on the first open; only <see cref="ManualReset"/> clears it.</summary>
    public bool StickyAlertActive { get; private set; }

    /// <summary>
    /// Records a crash and returns true if the supervisor may respawn. On
    /// false, <paramref name="cooldownRemaining"/> is how long to wait before
    /// trying again (<see cref="TimeSpan.MaxValue"/> means manual reset required).
    /// </summary>
    public bool TryRecordCrash(DateTime utcNow, out TimeSpan cooldownRemaining)
    {
        if (_openSinceUtc is { } openedAt)
        {
            var cooldown = CurrentCooldown();
            if (cooldown == TimeSpan.MaxValue)
            {
                // Top of the ladder: never auto-close. Checked before any subtraction so
                // we never do arithmetic on TimeSpan.MaxValue.
                cooldownRemaining = TimeSpan.MaxValue;
                return false;
            }

            var elapsed = utcNow - openedAt;
            if (elapsed < cooldown)
            {
                cooldownRemaining = cooldown - elapsed;
                return false;
            }

            // Cooldown served: half-close and escalate so the next open costs more.
            _openSinceUtc = null;
            _escalationLevel++;
        }

        // Slide the window, then count this crash.
        _crashesUtc.RemoveAll(t => utcNow - t > Window);
        _crashesUtc.Add(utcNow);

        if (_crashesUtc.Count > CrashesAllowedPerWindow)
        {
            _openSinceUtc = utcNow;
            StickyAlertActive = true;
            cooldownRemaining = CurrentCooldown();
            return false;
        }

        cooldownRemaining = TimeSpan.Zero;
        return true;
    }

    /// <summary>Operator action: clears the crash history, closes the breaker, resets escalation and the sticky alert.</summary>
    public void ManualReset()
    {
        _crashesUtc.Clear();
        _openSinceUtc = null;
        _escalationLevel = 0;
        StickyAlertActive = false;
    }

    // Cooldown for the current escalation level, clamped to the last rung.
    private TimeSpan CurrentCooldown() =>
        CooldownEscalation[Math.Min(_escalationLevel, CooldownEscalation.Length - 1)];
}
namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor;

/// <summary>
/// Ties <see cref="Backoff"/>, <see cref="CircuitBreaker"/> and
/// <see cref="IHostProcessLauncher"/> into one object the driver asks for
/// <see cref="IFocasClient"/>s. On a detected crash (process exit or
/// heartbeat loss) the supervisor fans out BadCommunicationError to all
/// subscribers via the <see cref="OnUnavailable"/> callback, then respawns with
/// backoff unless the breaker is open.
/// </summary>
/// <remarks>
/// The supervisor itself is I/O-free — it doesn't know how to spawn processes, probe
/// pipes, or send heartbeats. Production wires the concrete <see cref="IHostProcessLauncher"/>
/// over FocasIpcClient + Process; tests drive the same state machine with a
/// deterministic launcher stub.
/// </remarks>
public sealed class FocasHostSupervisor : IDisposable
{
    private readonly IHostProcessLauncher _launcher;
    private readonly Backoff _backoff;
    private readonly CircuitBreaker _breaker;
    private readonly Func<DateTime> _clock;   // injectable for deterministic tests
    private IFocasClient? _current;
    private DateTime _currentStartedUtc;
    private bool _disposed;

    public FocasHostSupervisor(
        IHostProcessLauncher launcher,
        Backoff? backoff = null,
        CircuitBreaker? breaker = null,
        Func<DateTime>? clock = null)
    {
        _launcher = launcher ?? throw new ArgumentNullException(nameof(launcher));
        _backoff = backoff ?? new Backoff();
        _breaker = breaker ?? new CircuitBreaker();
        _clock = clock ?? (() => DateTime.UtcNow);
    }

    /// <summary>Raised with a short reason string whenever the Host goes unavailable (crash / heartbeat loss / breaker-open).</summary>
    public event Action<string>? OnUnavailable;

    /// <summary>Crash count observed in the current process lifetime. Exposed for /hosts Admin telemetry.</summary>
    public int ObservedCrashes { get; private set; }

    /// <summary>true if the crash-loop breaker has latched a sticky alert that needs operator reset.</summary>
    public bool StickyAlertActive => _breaker.StickyAlertActive;

    /// <summary>Current rung of the backoff ladder. Exposed for /hosts Admin telemetry.</summary>
    public int BackoffAttempt => _backoff.AttemptIndex;

    /// <summary>
    /// Returns the current live client. If none (or the Host process has died under
    /// us), tries to launch — applying the backoff schedule between attempts and
    /// stopping once the breaker opens.
    /// </summary>
    public async Task<IFocasClient> GetOrLaunchAsync(CancellationToken ct)
    {
        ThrowIfDisposed();
        if (_current is not null)
        {
            if (_launcher.IsProcessAlive) return _current;

            // FIX: the process died while we held a client — dispose the stale client
            // instead of leaking it before launching a replacement.
            _current.Dispose();
            _current = null;
        }

        return await LaunchWithBackoffAsync(ct).ConfigureAwait(false);
    }

    /// <summary>
    /// Called by the heartbeat task each time a miss threshold is crossed.
    /// Treated as a crash: fan out Bad status + attempt respawn.
    /// </summary>
    public async Task NotifyHostDeadAsync(string reason, CancellationToken ct)
    {
        ThrowIfDisposed();
        OnUnavailable?.Invoke(reason);
        ObservedCrashes++;
        try { await _launcher.TerminateAsync(ct).ConfigureAwait(false); }
        catch { /* best effort — the process may already be gone */ }
        _current?.Dispose();
        _current = null;

        if (!_breaker.TryRecordCrash(_clock(), out var cooldown))
        {
            OnUnavailable?.Invoke(cooldown == TimeSpan.MaxValue
                ? "circuit-breaker-open-manual-reset-required"
                : $"circuit-breaker-open-cooldown-{cooldown:g}");
            return;
        }
        // Successful crash recording — do not respawn synchronously; GetOrLaunchAsync will
        // pick up the attempt on the next call. Keeps the fan-out fast.
    }

    /// <summary>Operator action — clear the sticky alert + reset the breaker and backoff ladder.</summary>
    public void AcknowledgeAndReset()
    {
        _breaker.ManualReset();
        _backoff.RecordStableRun();
    }

    private async Task<IFocasClient> LaunchWithBackoffAsync(CancellationToken ct)
    {
        while (true)
        {
            if (_breaker.StickyAlertActive)
            {
                // Probe the breaker before spending a launch attempt. NOTE(review):
                // TryRecordCrash is the only gate the breaker exposes, so the probe
                // itself counts as a crash; tolerable because we only get here while
                // the alert is latched.
                if (!_breaker.TryRecordCrash(_clock(), out var cooldown))
                {
                    // FIX: the original refused only the manual-reset (MaxValue) case and
                    // otherwise fell through to launch, silently bypassing an open
                    // breaker's remaining cooldown.
                    throw new InvalidOperationException(cooldown == TimeSpan.MaxValue
                        ? "FOCAS Host circuit breaker is open and awaiting manual reset. " +
                          "See Admin /hosts; call AcknowledgeAndReset after investigating the Host log."
                        : $"FOCAS Host circuit breaker is open — cooldown {cooldown:g} remaining.");
                }
            }

            var attemptStartedUtc = _clock();
            try
            {
                _current = await _launcher.LaunchAsync(ct).ConfigureAwait(false);
                _currentStartedUtc = _clock();

                // FIX: measure launch duration from *before* LaunchAsync. The original
                // compared the clock against a timestamp taken on the preceding line,
                // so the elapsed time was always ~zero and the reset was dead code.
                if (_currentStartedUtc - attemptStartedUtc >= _backoff.StableRunThreshold)
                    _backoff.RecordStableRun();

                return _current;
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                OnUnavailable?.Invoke($"launch-failed: {ex.Message}");
                ObservedCrashes++;
                if (!_breaker.TryRecordCrash(_clock(), out var cooldown))
                {
                    var hint = cooldown == TimeSpan.MaxValue
                        ? "manual reset required"
                        : $"cooldown {cooldown:g}";
                    throw new InvalidOperationException(
                        $"FOCAS Host circuit breaker opened after {ObservedCrashes} crashes — {hint}.", ex);
                }

                var delay = _backoff.Next();
                await Task.Delay(delay, ct).ConfigureAwait(false);
            }
        }
    }

    /// <summary>Called from the heartbeat loop after a successful ack run — relaxes the backoff ladder.</summary>
    public void NotifyStableRun()
    {
        if (_current is null) return;
        if (_clock() - _currentStartedUtc >= _backoff.StableRunThreshold)
            _backoff.RecordStableRun();
    }

    public void Dispose()
    {
        if (_disposed) return;
        _disposed = true;
        // Sync-over-async is tolerated here: Dispose must not throw and the launcher's
        // terminate path is expected to be short.
        try { _launcher.TerminateAsync(CancellationToken.None).GetAwaiter().GetResult(); }
        catch { /* best effort */ }
        _current?.Dispose();
        _current = null;
    }

    private void ThrowIfDisposed()
    {
        if (_disposed) throw new ObjectDisposedException(nameof(FocasHostSupervisor));
    }
}
namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor;

/// <summary>
/// Tracks missed heartbeats from the FOCAS Host. 2s cadence + 3 consecutive misses =
/// host declared dead (~6s detection). Same defaults as Galaxy Tier-C so operators
/// see the same cadence across hosts on the /hosts Admin page.
/// </summary>
public sealed class HeartbeatMonitor
{
    /// <summary>Consecutive misses after which the Host is declared dead.</summary>
    public int MissesUntilDead { get; init; } = 3;

    /// <summary>Expected interval between heartbeats.</summary>
    public TimeSpan Cadence { get; init; } = TimeSpan.FromSeconds(2);

    /// <summary>Misses accumulated since the most recent ack.</summary>
    public int ConsecutiveMisses { get; private set; }

    /// <summary>UTC timestamp of the most recent ack, or null if none yet.</summary>
    public DateTime? LastAckUtc { get; private set; }

    /// <summary>Records a heartbeat ack: stamps the ack time and clears the miss streak.</summary>
    public void RecordAck(DateTime utcNow)
    {
        LastAckUtc = utcNow;
        ConsecutiveMisses = 0;
    }

    /// <summary>Records a missed heartbeat; returns true when the death threshold is crossed.</summary>
    public bool RecordMiss() => ++ConsecutiveMisses >= MissesUntilDead;
}
using Shouldly;
using Xunit;

namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor
{
    /// <summary>
    /// Abstraction over the act of spawning a FOCAS Host process and obtaining an
    /// <see cref="IFocasClient"/> connected to it. Production wires this to a real
    /// Process.Start + FocasIpcClient.ConnectAsync; tests use a fake that
    /// exposes deterministic failure modes so the supervisor logic can be stressed
    /// without spawning actual exes.
    /// </summary>
    public interface IHostProcessLauncher
    {
        /// <summary>
        /// Spawn a new Host process (if one isn't already running) and return a live
        /// client session. Throws on unrecoverable errors; transient errors (e.g. Host
        /// not ready yet) should throw so the supervisor applies the backoff ladder.
        /// </summary>
        Task<IFocasClient> LaunchAsync(CancellationToken ct);

        /// <summary>
        /// Terminate the Host process if one is running. Called on Dispose and after a
        /// heartbeat loss is detected.
        /// </summary>
        Task TerminateAsync(CancellationToken ct);

        /// <summary>
        /// true when the most recently spawned Host process is still alive.
        /// Supervisor polls this at heartbeat cadence; going false without a
        /// clean shutdown counts as a crash.
        /// </summary>
        bool IsProcessAlive { get; }
    }
}

namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Tests
{
    using ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor;

    [Trait("Category", "Unit")]
    public sealed class BackoffTests
    {
        [Fact]
        public void Default_sequence_is_5s_15s_60s_then_clamped()
        {
            var b = new Backoff();
            b.Next().ShouldBe(TimeSpan.FromSeconds(5));
            b.Next().ShouldBe(TimeSpan.FromSeconds(15));
            b.Next().ShouldBe(TimeSpan.FromSeconds(60));
            // Past the end of the ladder the largest delay repeats.
            b.Next().ShouldBe(TimeSpan.FromSeconds(60));
            b.Next().ShouldBe(TimeSpan.FromSeconds(60));
        }

        [Fact]
        public void RecordStableRun_resets_the_ladder_to_the_start()
        {
            var b = new Backoff();
            b.Next(); b.Next();
            b.AttemptIndex.ShouldBe(2);
            b.RecordStableRun();
            b.AttemptIndex.ShouldBe(0);
            b.Next().ShouldBe(TimeSpan.FromSeconds(5));
        }
    }

    [Trait("Category", "Unit")]
    public sealed class CircuitBreakerTests
    {
        [Fact]
        public void Allows_crashes_below_threshold()
        {
            var b = new CircuitBreaker();
            var now = DateTime.UtcNow;
            b.TryRecordCrash(now, out _).ShouldBeTrue();
            b.TryRecordCrash(now.AddSeconds(1), out _).ShouldBeTrue();
            b.TryRecordCrash(now.AddSeconds(2), out _).ShouldBeTrue();
            b.StickyAlertActive.ShouldBeFalse();
        }

        [Fact]
        public void Opens_when_exceeding_threshold_in_window()
        {
            var b = new CircuitBreaker();
            var now = DateTime.UtcNow;
            b.TryRecordCrash(now, out _);
            b.TryRecordCrash(now.AddSeconds(1), out _);
            b.TryRecordCrash(now.AddSeconds(2), out _);
            // Fourth crash inside the 5-minute window trips the breaker.
            b.TryRecordCrash(now.AddSeconds(3), out var cooldown).ShouldBeFalse();
            cooldown.ShouldBe(TimeSpan.FromHours(1));
            b.StickyAlertActive.ShouldBeTrue();
        }

        [Fact]
        public void Escalates_cooldown_after_second_open()
        {
            var b = new CircuitBreaker();
            var t0 = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc);
            // First burst — 4 crashes opens breaker with 1h cooldown.
            for (var i = 0; i < 4; i++) b.TryRecordCrash(t0.AddSeconds(i), out _);
            b.StickyAlertActive.ShouldBeTrue();

            // Wait past cooldown. The first crash after cooldown-elapsed resets _openSinceUtc and
            // bumps escalation level; the next 3 crashes then re-open with the escalated 4h cooldown.
            b.TryRecordCrash(t0.AddHours(1).AddMinutes(1), out _);
            var t1 = t0.AddHours(1).AddMinutes(1).AddSeconds(1);
            b.TryRecordCrash(t1, out _);
            b.TryRecordCrash(t1.AddSeconds(1), out _);
            b.TryRecordCrash(t1.AddSeconds(2), out var cooldown).ShouldBeFalse();
            cooldown.ShouldBe(TimeSpan.FromHours(4));
        }

        [Fact]
        public void ManualReset_clears_everything()
        {
            var b = new CircuitBreaker();
            var now = DateTime.UtcNow;
            for (var i = 0; i < 5; i++) b.TryRecordCrash(now.AddSeconds(i), out _);
            b.StickyAlertActive.ShouldBeTrue();
            b.ManualReset();
            b.StickyAlertActive.ShouldBeFalse();
            b.TryRecordCrash(now.AddSeconds(10), out _).ShouldBeTrue();
        }
    }

    [Trait("Category", "Unit")]
    public sealed class HeartbeatMonitorTests
    {
        [Fact]
        public void Three_consecutive_misses_declares_dead()
        {
            var m = new HeartbeatMonitor();
            m.RecordMiss().ShouldBeFalse();
            m.RecordMiss().ShouldBeFalse();
            m.RecordMiss().ShouldBeTrue();
        }

        [Fact]
        public void Ack_resets_the_miss_counter()
        {
            var m = new HeartbeatMonitor();
            m.RecordMiss(); m.RecordMiss();
            m.ConsecutiveMisses.ShouldBe(2);
            m.RecordAck(DateTime.UtcNow);
            m.ConsecutiveMisses.ShouldBe(0);
        }
    }

    [Trait("Category", "Unit")]
    public sealed class FocasHostSupervisorTests
    {
        /// <summary>
        /// Deterministic launcher: each queued delegate either yields a client or throws,
        /// so failure sequences can be scripted without spawning real processes.
        /// </summary>
        private sealed class FakeLauncher : IHostProcessLauncher
        {
            public int LaunchAttempts { get; private set; }
            public int Terminations { get; private set; }
            public Queue<Func<IFocasClient>> Plan { get; } = new();
            public bool IsProcessAlive { get; set; }

            public Task<IFocasClient> LaunchAsync(CancellationToken ct)
            {
                LaunchAttempts++;
                if (Plan.Count == 0) throw new InvalidOperationException("FakeLauncher plan exhausted");
                // A throwing plan entry propagates before IsProcessAlive flips — a failed
                // launch must not look like a live process.
                var next = Plan.Dequeue()();
                IsProcessAlive = true;
                return Task.FromResult(next);
            }

            public Task TerminateAsync(CancellationToken ct)
            {
                Terminations++;
                IsProcessAlive = false;
                return Task.CompletedTask;
            }
        }

        /// <summary>Minimal always-connected client; the supervisor never inspects results.</summary>
        private sealed class StubFocasClient : IFocasClient
        {
            public bool IsConnected => true;
            public Task ConnectAsync(FocasHostAddress address, TimeSpan timeout, CancellationToken ct) => Task.CompletedTask;
            public Task<(object? value, uint status)> ReadAsync(FocasAddress a, FocasDataType t, CancellationToken ct) =>
                Task.FromResult<(object?, uint)>((0, 0));
            public Task<uint> WriteAsync(FocasAddress a, FocasDataType t, object? v, CancellationToken ct) => Task.FromResult(0u);
            public Task<bool> ProbeAsync(CancellationToken ct) => Task.FromResult(true);
            public void Dispose() { }
        }

        [Fact]
        public async Task GetOrLaunch_returns_client_on_first_success()
        {
            var launcher = new FakeLauncher();
            launcher.Plan.Enqueue(() => new StubFocasClient());
            var supervisor = new FocasHostSupervisor(launcher);
            var client = await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken);
            client.ShouldNotBeNull();
            launcher.LaunchAttempts.ShouldBe(1);
        }

        [Fact]
        public async Task GetOrLaunch_retries_after_transient_failure_with_backoff()
        {
            var launcher = new FakeLauncher();
            launcher.Plan.Enqueue(() => throw new TimeoutException("pipe not ready"));
            launcher.Plan.Enqueue(() => new StubFocasClient());

            var backoff = new Backoff([TimeSpan.FromMilliseconds(10), TimeSpan.FromMilliseconds(20)]);
            var supervisor = new FocasHostSupervisor(launcher, backoff);

            var unavailableMessages = new List<string>();
            supervisor.OnUnavailable += m => unavailableMessages.Add(m);

            var client = await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken);
            client.ShouldNotBeNull();
            launcher.LaunchAttempts.ShouldBe(2);
            unavailableMessages.Count.ShouldBe(1);
            unavailableMessages[0].ShouldContain("launch-failed");
        }

        [Fact]
        public async Task Repeated_launch_failures_open_breaker_and_surface_InvalidOperation()
        {
            var launcher = new FakeLauncher();
            for (var i = 0; i < 10; i++)
                launcher.Plan.Enqueue(() => throw new InvalidOperationException("simulated host refused"));

            var supervisor = new FocasHostSupervisor(
                launcher,
                backoff: new Backoff([TimeSpan.FromMilliseconds(1)]),
                breaker: new CircuitBreaker { CrashesAllowedPerWindow = 2, Window = TimeSpan.FromMinutes(5) });

            var ex = await Should.ThrowAsync<InvalidOperationException>(async () =>
                await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken));
            ex.Message.ShouldContain("circuit breaker");
            supervisor.StickyAlertActive.ShouldBeTrue();
        }

        [Fact]
        public async Task NotifyHostDeadAsync_terminates_current_and_fans_out_unavailable()
        {
            var launcher = new FakeLauncher();
            launcher.Plan.Enqueue(() => new StubFocasClient());
            var supervisor = new FocasHostSupervisor(launcher);

            var messages = new List<string>();
            supervisor.OnUnavailable += m => messages.Add(m);
            await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken);

            await supervisor.NotifyHostDeadAsync("heartbeat-loss", TestContext.Current.CancellationToken);

            launcher.Terminations.ShouldBe(1);
            messages.ShouldContain("heartbeat-loss");
            supervisor.ObservedCrashes.ShouldBe(1);
        }

        [Fact]
        public async Task AcknowledgeAndReset_clears_sticky_alert()
        {
            var launcher = new FakeLauncher();
            for (var i = 0; i < 10; i++)
                launcher.Plan.Enqueue(() => throw new InvalidOperationException("refused"));
            var supervisor = new FocasHostSupervisor(
                launcher,
                backoff: new Backoff([TimeSpan.FromMilliseconds(1)]),
                breaker: new CircuitBreaker { CrashesAllowedPerWindow = 1 });

            try { await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken); } catch { }
            supervisor.StickyAlertActive.ShouldBeTrue();

            supervisor.AcknowledgeAndReset();
            supervisor.StickyAlertActive.ShouldBeFalse();
        }

        [Fact]
        public async Task Dispose_terminates_host_process()
        {
            var launcher = new FakeLauncher();
            launcher.Plan.Enqueue(() => new StubFocasClient());
            var supervisor = new FocasHostSupervisor(launcher);
            await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken);

            supervisor.Dispose();
            launcher.Terminations.ShouldBe(1);
        }
    }
}