From 503360994417c0e1796597af5f0b33a03e61998a Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 20 Apr 2026 14:17:23 -0400 Subject: [PATCH] =?UTF-8?q?FOCAS=20Tier-C=20PR=20D=20=E2=80=94=20superviso?= =?UTF-8?q?r=20+=20backoff=20+=20crash-loop=20breaker=20+=20heartbeat=20mo?= =?UTF-8?q?nitor.=20Fourth=20of=205=20PRs=20for=20#220.=20Ships=20the=20re?= =?UTF-8?q?silience=20harness=20that=20sits=20between=20the=20driver's=20I?= =?UTF-8?q?FocasClient=20requests=20and=20the=20Tier-C=20Host=20process,?= =?UTF-8?q?=20so=20a=20crashing=20Fwlib32.dll=20takes=20down=20only=20the?= =?UTF-8?q?=20Host=20(not=20the=20main=20server),=20gets=20respawned=20on?= =?UTF-8?q?=20a=20backoff=20ladder,=20and=20opens=20a=20circuit=20with=20a?= =?UTF-8?q?=20sticky=20operator=20alert=20when=20the=20crash=20rate=20is?= =?UTF-8?q?=20pathological.=20Same=20shape=20as=20Galaxy=20Tier-C=20so=20t?= =?UTF-8?q?he=20Admin=20/hosts=20surface=20has=20a=20single=20mental=20mod?= =?UTF-8?q?el.=20New=20Supervisor/=20namespace=20in=20Driver.FOCAS=20(.NET?= =?UTF-8?q?=2010,=20Proxy-side):=20Backoff=20with=20the=205s=E2=86=9215s?= =?UTF-8?q?=E2=86=9260s=20default=20ladder=20+=20StableRunThreshold=20that?= =?UTF-8?q?=20resets=20the=20index=20after=20a=202-min=20clean=20run=20(so?= =?UTF-8?q?=20a=20one-off=20crash=20after=20hours=20of=20steady-state=20do?= =?UTF-8?q?esn't=20restart=20from=20the=20top);=20CircuitBreaker=20with=20?= =?UTF-8?q?3-crashes-in-5-min=20threshold=20+=20escalating=201h=E2=86=924h?= =?UTF-8?q?=E2=86=92manual-reset=20cooldown=20ladder=20+=20StickyAlertActi?= =?UTF-8?q?ve=20flag=20that=20persists=20across=20cooldowns=20until=20Ackn?= =?UTF-8?q?owledgeAndReset=20is=20called;=20HeartbeatMonitor=20tracking=20?= =?UTF-8?q?ConsecutiveMisses=20against=20the=203-misses-kill=20threshold?= =?UTF-8?q?=20+=20LastAckUtc=20for=20telemetry;=20IHostProcessLauncher=20a?= =?UTF-8?q?bstraction=20over=20"spawn=20Host=20process=20+=20produce=20an?= =?UTF-8?q?=20IFocasClient=20connected=20to=20it"=20so=20the=20supervisor?= =?UTF-8?q?=20stays=20I/O-free=20and=20fully=20testable=20with=20a=20fake?= =?UTF-8?q?=20launcher=20that=20can=20be=20told=20to=20throw=20on=20specif?= =?UTF-8?q?ic=20attempts=20(production=20wiring=20over=20Process.Start=20+?= =?UTF-8?q?=20FocasIpcClient.ConnectAsync=20is=20the=20PR=20E=20ops-glue?= =?UTF-8?q?=20concern);=20FocasHostSupervisor=20orchestrating=20them=20?= =?UTF-8?q?=E2=80=94=20GetOrLaunchAsync=20cycles=20through=20backoff=20unt?= =?UTF-8?q?il=20either=20a=20client=20is=20returned=20or=20the=20breaker?= =?UTF-8?q?=20opens=20(surfaced=20as=20InvalidOperationException=20so=20th?= =?UTF-8?q?e=20driver=20maps=20to=20BadDeviceFailure),=20NotifyHostDeadAsy?= =?UTF-8?q?nc=20fans=20out=20the=20unavailable=20event=20+=20terminates=20?= =?UTF-8?q?the=20current=20launcher=20+=20records=20the=20crash=20without?= =?UTF-8?q?=20blocking=20(so=20heartbeat-loss=20detection=20can=20short-ci?= =?UTF-8?q?rcuit=20subscriber=20fan-out=20and=20let=20the=20next=20GetOrLa?= =?UTF-8?q?unchAsync=20handle=20the=20respawn),=20AcknowledgeAndReset=20is?= =?UTF-8?q?=20the=20operator-clear=20path,=20OnUnavailable=20event=20for?= =?UTF-8?q?=20Admin=20/hosts=20wiring=20+=20ObservedCrashes=20+=20BackoffA?= =?UTF-8?q?ttempt=20+=20StickyAlertActive=20for=20telemetry.=2014=20new=20?= =?UTF-8?q?unit=20tests=20across=20SupervisorTests.cs:=20Backoff=20(defaul?= =?UTF-8?q?t=20sequence,=20clamping,=20RecordStableRun=20resets),=20Circui?= =?UTF-8?q?tBreaker=20(below=20threshold=20allowed,=20opens=20at=20thresho?= =?UTF-8?q?ld,=20escalates=20cooldown=20after=20second=20open,=20ManualRes?= =?UTF-8?q?et=20clears=20state),=20HeartbeatMonitor=20(3=20consecutive=20m?= =?UTF-8?q?isses=20declares=20dead,=20ack=20resets=20counter),=20FocasHost?= =?UTF-8?q?Supervisor=20(first-launch=20success,=20retry-with-backoff=20af?= =?UTF-8?q?ter=20transient=20failure,=20repeated=20failures=20open=20break?= =?UTF-8?q?er=20+=20surface=20InvalidOperationException,=20NotifyHostDeadA?= =?UTF-8?q?sync=20terminates=20+=20fan-outs=20+=20increments=20crash=20cou?= =?UTF-8?q?nt,=20AcknowledgeAndReset=20clears=20sticky,=20Dispose=20termin?= =?UTF-8?q?ates).=20Full=20FOCAS=20driver=20tests=20now=20186/186=20green?= =?UTF-8?q?=20(172=20+=2014=20new).=20No=20changes=20to=20IFocasClient=20D?= =?UTF-8?q?I=20contract;=20existing=20FakeFocasClient-based=20tests=20unaf?= =?UTF-8?q?fected.=20PR=20E=20wires=20the=20real=20Process-based=20IHostPr?= =?UTF-8?q?ocessLauncher=20+=20NSSM=20install=20scripts=20+=20MMF=20post-m?= =?UTF-8?q?ortem=20+=20docs.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Supervisor/Backoff.cs | 30 +++ .../Supervisor/CircuitBreaker.cs | 69 +++++ .../Supervisor/FocasHostSupervisor.cs | 159 +++++++++++ .../Supervisor/HeartbeatMonitor.cs | 29 ++ .../Supervisor/IHostProcessLauncher.cs | 32 +++ .../SupervisorTests.cs | 249 ++++++++++++++++++ 6 files changed, 568 insertions(+) create mode 100644 src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/Backoff.cs create mode 100644 src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/CircuitBreaker.cs create mode 100644 src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/FocasHostSupervisor.cs create mode 100644 src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/HeartbeatMonitor.cs create mode 100644 src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/IHostProcessLauncher.cs create mode 100644 tests/ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Tests/SupervisorTests.cs diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/Backoff.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/Backoff.cs new file mode 100644 index 0000000..8ffa193 --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/Backoff.cs @@ -0,0 +1,30 @@ +namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor; + +/// +/// Respawn-with-backoff schedule for the FOCAS Host process. Matches Galaxy Tier-C: +/// 5s → 15s → 60s cap. A sustained stable run (default 2 min) resets the index so a +/// one-off crash after hours of steady-state doesn't start from the top of the ladder. +/// +public sealed class Backoff +{ + public static TimeSpan[] DefaultSequence { get; } = + [TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(15), TimeSpan.FromSeconds(60)]; + + public TimeSpan StableRunThreshold { get; init; } = TimeSpan.FromMinutes(2); + + private readonly TimeSpan[] _sequence; + private int _index; + + public Backoff(TimeSpan[]? sequence = null) => _sequence = sequence ?? DefaultSequence; + + public TimeSpan Next() + { + var delay = _sequence[Math.Min(_index, _sequence.Length - 1)]; + _index++; + return delay; + } + + public void RecordStableRun() => _index = 0; + + public int AttemptIndex => _index; +} diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/CircuitBreaker.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/CircuitBreaker.cs new file mode 100644 index 0000000..eeb1a7b --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/CircuitBreaker.cs @@ -0,0 +1,69 @@ +namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor; + +/// +/// Crash-loop circuit breaker for the FOCAS Host. Matches Galaxy Tier-C defaults: +/// 3 crashes within 5 minutes opens the breaker; cooldown escalates 1h → 4h → manual +/// reset. A sticky alert stays live until the operator explicitly clears it so +/// recurring crashes can't silently burn through the cooldown ladder overnight. +/// +public sealed class CircuitBreaker +{ + public int CrashesAllowedPerWindow { get; init; } = 3; + public TimeSpan Window { get; init; } = TimeSpan.FromMinutes(5); + + public TimeSpan[] CooldownEscalation { get; init; } = + [TimeSpan.FromHours(1), TimeSpan.FromHours(4), TimeSpan.MaxValue]; + + private readonly List _crashesUtc = []; + private DateTime? _openSinceUtc; + private int _escalationLevel; + public bool StickyAlertActive { get; private set; } + + /// + /// Records a crash + returns true if the supervisor may respawn. On + /// false, is how long to wait before + /// trying again (TimeSpan.MaxValue means manual reset required). + /// + public bool TryRecordCrash(DateTime utcNow, out TimeSpan cooldownRemaining) + { + if (_openSinceUtc is { } openedAt) + { + var cooldown = CooldownEscalation[Math.Min(_escalationLevel, CooldownEscalation.Length - 1)]; + if (cooldown == TimeSpan.MaxValue) + { + cooldownRemaining = TimeSpan.MaxValue; + return false; + } + if (utcNow - openedAt < cooldown) + { + cooldownRemaining = cooldown - (utcNow - openedAt); + return false; + } + + _openSinceUtc = null; + _escalationLevel++; + } + + _crashesUtc.RemoveAll(t => utcNow - t > Window); + _crashesUtc.Add(utcNow); + + if (_crashesUtc.Count > CrashesAllowedPerWindow) + { + _openSinceUtc = utcNow; + StickyAlertActive = true; + cooldownRemaining = CooldownEscalation[Math.Min(_escalationLevel, CooldownEscalation.Length - 1)]; + return false; + } + + cooldownRemaining = TimeSpan.Zero; + return true; + } + + public void ManualReset() + { + _crashesUtc.Clear(); + _openSinceUtc = null; + _escalationLevel = 0; + StickyAlertActive = false; + } +} diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/FocasHostSupervisor.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/FocasHostSupervisor.cs new file mode 100644 index 0000000..fc58d38 --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/FocasHostSupervisor.cs @@ -0,0 +1,159 @@ +namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor; + +/// +/// Ties + + +/// + into one object the +/// driver asks for IFocasClients. On a detected crash (process exit or +/// heartbeat loss) the supervisor fans out BadCommunicationError to all +/// subscribers via the callback, then respawns with +/// backoff unless the breaker is open. +/// +/// +/// The supervisor itself is I/O-free — it doesn't know how to spawn processes, probe +/// pipes, or send heartbeats. Production wires the concrete +/// over FocasIpcClient + Process; +/// tests drive the same state machine with a deterministic launcher stub. +/// +public sealed class FocasHostSupervisor : IDisposable +{ + private readonly IHostProcessLauncher _launcher; + private readonly Backoff _backoff; + private readonly CircuitBreaker _breaker; + private readonly Func _clock; + private IFocasClient? _current; + private DateTime _currentStartedUtc; + private bool _disposed; + + public FocasHostSupervisor( + IHostProcessLauncher launcher, + Backoff? backoff = null, + CircuitBreaker? breaker = null, + Func? clock = null) + { + _launcher = launcher ?? throw new ArgumentNullException(nameof(launcher)); + _backoff = backoff ?? new Backoff(); + _breaker = breaker ?? new CircuitBreaker(); + _clock = clock ?? (() => DateTime.UtcNow); + } + + /// Raised with a short reason string whenever the Host goes unavailable (crash / heartbeat loss / breaker-open). + public event Action? OnUnavailable; + + /// Crash count observed in the current process lifetime. Exposed for /hosts Admin telemetry. + public int ObservedCrashes { get; private set; } + + /// true if the crash-loop breaker has latched a sticky alert that needs operator reset. + public bool StickyAlertActive => _breaker.StickyAlertActive; + + public int BackoffAttempt => _backoff.AttemptIndex; + + /// + /// Returns the current live client. If none, tries to launch — applying the + /// backoff schedule between attempts and stopping once the breaker opens. + /// + public async Task GetOrLaunchAsync(CancellationToken ct) + { + ThrowIfDisposed(); + if (_current is not null && _launcher.IsProcessAlive) return _current; + + return await LaunchWithBackoffAsync(ct).ConfigureAwait(false); + } + + /// + /// Called by the heartbeat task each time a miss threshold is crossed. + /// Treated as a crash: fan out Bad status + attempt respawn. + /// + public async Task NotifyHostDeadAsync(string reason, CancellationToken ct) + { + ThrowIfDisposed(); + OnUnavailable?.Invoke(reason); + ObservedCrashes++; + try { await _launcher.TerminateAsync(ct).ConfigureAwait(false); } + catch { /* best effort */ } + _current?.Dispose(); + _current = null; + + if (!_breaker.TryRecordCrash(_clock(), out var cooldown)) + { + OnUnavailable?.Invoke(cooldown == TimeSpan.MaxValue + ? "circuit-breaker-open-manual-reset-required" + : $"circuit-breaker-open-cooldown-{cooldown:g}"); + return; + } + // Successful crash recording — do not respawn synchronously; GetOrLaunchAsync will + // pick up the attempt on the next call. Keeps the fan-out fast. + } + + /// Operator action — clear the sticky alert + reset the breaker. + public void AcknowledgeAndReset() + { + _breaker.ManualReset(); + _backoff.RecordStableRun(); + } + + private async Task LaunchWithBackoffAsync(CancellationToken ct) + { + while (true) + { + if (_breaker.StickyAlertActive) + { + if (!_breaker.TryRecordCrash(_clock(), out var cooldown) && cooldown == TimeSpan.MaxValue) + throw new InvalidOperationException( + "FOCAS Host circuit breaker is open and awaiting manual reset. " + + "See Admin /hosts; call AcknowledgeAndReset after investigating the Host log."); + } + + try + { + _current = await _launcher.LaunchAsync(ct).ConfigureAwait(false); + _currentStartedUtc = _clock(); + + // If the launch sequence itself takes long enough to count as a stable run, + // reset the backoff ladder immediately. + if (_clock() - _currentStartedUtc >= _backoff.StableRunThreshold) + _backoff.RecordStableRun(); + + return _current; + } + catch (Exception ex) when (ex is not OperationCanceledException) + { + OnUnavailable?.Invoke($"launch-failed: {ex.Message}"); + ObservedCrashes++; + if (!_breaker.TryRecordCrash(_clock(), out var cooldown)) + { + var hint = cooldown == TimeSpan.MaxValue + ? "manual reset required" + : $"cooldown {cooldown:g}"; + throw new InvalidOperationException( + $"FOCAS Host circuit breaker opened after {ObservedCrashes} crashes — {hint}.", ex); + } + + var delay = _backoff.Next(); + await Task.Delay(delay, ct).ConfigureAwait(false); + } + } + } + + /// Called from the heartbeat loop after a successful ack run — relaxes the backoff ladder. + public void NotifyStableRun() + { + if (_current is null) return; + if (_clock() - _currentStartedUtc >= _backoff.StableRunThreshold) + _backoff.RecordStableRun(); + } + + public void Dispose() + { + if (_disposed) return; + _disposed = true; + try { _launcher.TerminateAsync(CancellationToken.None).GetAwaiter().GetResult(); } + catch { /* best effort */ } + _current?.Dispose(); + _current = null; + } + + private void ThrowIfDisposed() + { + if (_disposed) throw new ObjectDisposedException(nameof(FocasHostSupervisor)); + } +} diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/HeartbeatMonitor.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/HeartbeatMonitor.cs new file mode 100644 index 0000000..5b43309 --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/HeartbeatMonitor.cs @@ -0,0 +1,29 @@ +namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor; + +/// +/// Tracks missed heartbeats from the FOCAS Host. 2s cadence + 3 consecutive misses = +/// host declared dead (~6s detection). Same defaults as Galaxy Tier-C so operators +/// see the same cadence across hosts on the /hosts Admin page. +/// +public sealed class HeartbeatMonitor +{ + public int MissesUntilDead { get; init; } = 3; + + public TimeSpan Cadence { get; init; } = TimeSpan.FromSeconds(2); + + public int ConsecutiveMisses { get; private set; } + public DateTime? LastAckUtc { get; private set; } + + public void RecordAck(DateTime utcNow) + { + ConsecutiveMisses = 0; + LastAckUtc = utcNow; + } + + /// Records a missed heartbeat; returns true when the death threshold is crossed. + public bool RecordMiss() + { + ConsecutiveMisses++; + return ConsecutiveMisses >= MissesUntilDead; + } +} diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/IHostProcessLauncher.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/IHostProcessLauncher.cs new file mode 100644 index 0000000..50c4024 --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/IHostProcessLauncher.cs @@ -0,0 +1,32 @@ +namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor; + +/// +/// Abstraction over the act of spawning a FOCAS Host process and obtaining an +/// connected to it. Production wires this to a real +/// Process.Start + FocasIpcClient.ConnectAsync; tests use a fake that +/// exposes deterministic failure modes so the supervisor logic can be stressed +/// without spawning actual exes. +/// +public interface IHostProcessLauncher +{ + /// + /// Spawn a new Host process (if one isn't already running) and return a live + /// client session. Throws on unrecoverable errors; transient errors (e.g. Host + /// not ready yet) should throw so the supervisor + /// applies the backoff ladder. + /// + Task LaunchAsync(CancellationToken ct); + + /// + /// Terminate the Host process if one is running. Called on Dispose and after a + /// heartbeat loss is detected. + /// + Task TerminateAsync(CancellationToken ct); + + /// + /// true when the most recently spawned Host process is still alive. + /// Supervisor polls this at heartbeat cadence; going false without a + /// clean shutdown counts as a crash. + /// + bool IsProcessAlive { get; } +} diff --git a/tests/ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Tests/SupervisorTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Tests/SupervisorTests.cs new file mode 100644 index 0000000..9694e7f --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Tests/SupervisorTests.cs @@ -0,0 +1,249 @@ +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor; + +namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Tests; + +[Trait("Category", "Unit")] +public sealed class BackoffTests +{ + [Fact] + public void Default_sequence_is_5s_15s_60s_then_clamped() + { + var b = new Backoff(); + b.Next().ShouldBe(TimeSpan.FromSeconds(5)); + b.Next().ShouldBe(TimeSpan.FromSeconds(15)); + b.Next().ShouldBe(TimeSpan.FromSeconds(60)); + b.Next().ShouldBe(TimeSpan.FromSeconds(60)); + b.Next().ShouldBe(TimeSpan.FromSeconds(60)); + } + + [Fact] + public void RecordStableRun_resets_the_ladder_to_the_start() + { + var b = new Backoff(); + b.Next(); b.Next(); + b.AttemptIndex.ShouldBe(2); + b.RecordStableRun(); + b.AttemptIndex.ShouldBe(0); + b.Next().ShouldBe(TimeSpan.FromSeconds(5)); + } +} + +[Trait("Category", "Unit")] +public sealed class CircuitBreakerTests +{ + [Fact] + public void Allows_crashes_below_threshold() + { + var b = new CircuitBreaker(); + var now = DateTime.UtcNow; + b.TryRecordCrash(now, out _).ShouldBeTrue(); + b.TryRecordCrash(now.AddSeconds(1), out _).ShouldBeTrue(); + b.TryRecordCrash(now.AddSeconds(2), out _).ShouldBeTrue(); + b.StickyAlertActive.ShouldBeFalse(); + } + + [Fact] + public void Opens_when_exceeding_threshold_in_window() + { + var b = new CircuitBreaker(); + var now = DateTime.UtcNow; + b.TryRecordCrash(now, out _); + b.TryRecordCrash(now.AddSeconds(1), out _); + b.TryRecordCrash(now.AddSeconds(2), out _); + b.TryRecordCrash(now.AddSeconds(3), out var cooldown).ShouldBeFalse(); + cooldown.ShouldBe(TimeSpan.FromHours(1)); + b.StickyAlertActive.ShouldBeTrue(); + } + + [Fact] + public void Escalates_cooldown_after_second_open() + { + var b = new CircuitBreaker(); + var t0 = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc); + // First burst — 4 crashes opens breaker with 1h cooldown. + for (var i = 0; i < 4; i++) b.TryRecordCrash(t0.AddSeconds(i), out _); + b.StickyAlertActive.ShouldBeTrue(); + + // Wait past cooldown. The first crash after cooldown-elapsed resets _openSinceUtc and + // bumps escalation level; the next 3 crashes then re-open with the escalated 4h cooldown. + b.TryRecordCrash(t0.AddHours(1).AddMinutes(1), out _); + var t1 = t0.AddHours(1).AddMinutes(1).AddSeconds(1); + b.TryRecordCrash(t1, out _); + b.TryRecordCrash(t1.AddSeconds(1), out _); + b.TryRecordCrash(t1.AddSeconds(2), out var cooldown).ShouldBeFalse(); + cooldown.ShouldBe(TimeSpan.FromHours(4)); + } + + [Fact] + public void ManualReset_clears_everything() + { + var b = new CircuitBreaker(); + var now = DateTime.UtcNow; + for (var i = 0; i < 5; i++) b.TryRecordCrash(now.AddSeconds(i), out _); + b.StickyAlertActive.ShouldBeTrue(); + b.ManualReset(); + b.StickyAlertActive.ShouldBeFalse(); + b.TryRecordCrash(now.AddSeconds(10), out _).ShouldBeTrue(); + } +} + +[Trait("Category", "Unit")] +public sealed class HeartbeatMonitorTests +{ + [Fact] + public void Three_consecutive_misses_declares_dead() + { + var m = new HeartbeatMonitor(); + m.RecordMiss().ShouldBeFalse(); + m.RecordMiss().ShouldBeFalse(); + m.RecordMiss().ShouldBeTrue(); + } + + [Fact] + public void Ack_resets_the_miss_counter() + { + var m = new HeartbeatMonitor(); + m.RecordMiss(); m.RecordMiss(); + m.ConsecutiveMisses.ShouldBe(2); + m.RecordAck(DateTime.UtcNow); + m.ConsecutiveMisses.ShouldBe(0); + } +} + +[Trait("Category", "Unit")] +public sealed class FocasHostSupervisorTests +{ + private sealed class FakeLauncher : IHostProcessLauncher + { + public int LaunchAttempts { get; private set; } + public int Terminations { get; private set; } + public Queue> Plan { get; } = new(); + public bool IsProcessAlive { get; set; } + + public Task LaunchAsync(CancellationToken ct) + { + LaunchAttempts++; + if (Plan.Count == 0) throw new InvalidOperationException("FakeLauncher plan exhausted"); + var next = Plan.Dequeue()(); + IsProcessAlive = true; + return Task.FromResult(next); + } + + public Task TerminateAsync(CancellationToken ct) + { + Terminations++; + IsProcessAlive = false; + return Task.CompletedTask; + } + } + + private sealed class StubFocasClient : IFocasClient + { + public bool IsConnected => true; + public Task ConnectAsync(FocasHostAddress address, TimeSpan timeout, CancellationToken ct) => Task.CompletedTask; + public Task<(object? value, uint status)> ReadAsync(FocasAddress a, FocasDataType t, CancellationToken ct) => + Task.FromResult<(object?, uint)>((0, 0)); + public Task WriteAsync(FocasAddress a, FocasDataType t, object? v, CancellationToken ct) => Task.FromResult(0u); + public Task ProbeAsync(CancellationToken ct) => Task.FromResult(true); + public void Dispose() { } + } + + [Fact] + public async Task GetOrLaunch_returns_client_on_first_success() + { + var launcher = new FakeLauncher(); + launcher.Plan.Enqueue(() => new StubFocasClient()); + var supervisor = new FocasHostSupervisor(launcher); + var client = await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken); + client.ShouldNotBeNull(); + launcher.LaunchAttempts.ShouldBe(1); + } + + [Fact] + public async Task GetOrLaunch_retries_after_transient_failure_with_backoff() + { + var launcher = new FakeLauncher(); + launcher.Plan.Enqueue(() => throw new TimeoutException("pipe not ready")); + launcher.Plan.Enqueue(() => new StubFocasClient()); + + var backoff = new Backoff([TimeSpan.FromMilliseconds(10), TimeSpan.FromMilliseconds(20)]); + var supervisor = new FocasHostSupervisor(launcher, backoff); + + var unavailableMessages = new List(); + supervisor.OnUnavailable += m => unavailableMessages.Add(m); + + var client = await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken); + client.ShouldNotBeNull(); + launcher.LaunchAttempts.ShouldBe(2); + unavailableMessages.Count.ShouldBe(1); + unavailableMessages[0].ShouldContain("launch-failed"); + } + + [Fact] + public async Task Repeated_launch_failures_open_breaker_and_surface_InvalidOperation() + { + var launcher = new FakeLauncher(); + for (var i = 0; i < 10; i++) + launcher.Plan.Enqueue(() => throw new InvalidOperationException("simulated host refused")); + + var supervisor = new FocasHostSupervisor( + launcher, + backoff: new Backoff([TimeSpan.FromMilliseconds(1)]), + breaker: new CircuitBreaker { CrashesAllowedPerWindow = 2, Window = TimeSpan.FromMinutes(5) }); + + var ex = await Should.ThrowAsync(async () => + await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken)); + ex.Message.ShouldContain("circuit breaker"); + supervisor.StickyAlertActive.ShouldBeTrue(); + } + + [Fact] + public async Task NotifyHostDeadAsync_terminates_current_and_fans_out_unavailable() + { + var launcher = new FakeLauncher(); + launcher.Plan.Enqueue(() => new StubFocasClient()); + var supervisor = new FocasHostSupervisor(launcher); + + var messages = new List(); + supervisor.OnUnavailable += m => messages.Add(m); + await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken); + + await supervisor.NotifyHostDeadAsync("heartbeat-loss", TestContext.Current.CancellationToken); + + launcher.Terminations.ShouldBe(1); + messages.ShouldContain("heartbeat-loss"); + supervisor.ObservedCrashes.ShouldBe(1); + } + + [Fact] + public async Task AcknowledgeAndReset_clears_sticky_alert() + { + var launcher = new FakeLauncher(); + for (var i = 0; i < 10; i++) + launcher.Plan.Enqueue(() => throw new InvalidOperationException("refused")); + var supervisor = new FocasHostSupervisor( + launcher, + backoff: new Backoff([TimeSpan.FromMilliseconds(1)]), + breaker: new CircuitBreaker { CrashesAllowedPerWindow = 1 }); + + try { await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken); } catch { } + supervisor.StickyAlertActive.ShouldBeTrue(); + + supervisor.AcknowledgeAndReset(); + supervisor.StickyAlertActive.ShouldBeFalse(); + } + + [Fact] + public async Task Dispose_terminates_host_process() + { + var launcher = new FakeLauncher(); + launcher.Plan.Enqueue(() => new StubFocasClient()); + var supervisor = new FocasHostSupervisor(launcher); + await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken); + + supervisor.Dispose(); + launcher.Terminations.ShouldBe(1); + } +} -- 2.49.1