FOCAS Tier-C PR D — supervisor + backoff + crash-loop breaker #172
30
src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/Backoff.cs
Normal file
30
src/ZB.MOM.WW.OtOpcUa.Driver.FOCAS/Supervisor/Backoff.cs
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Respawn-with-backoff schedule for the FOCAS Host process. Matches Galaxy Tier-C:
|
||||||
|
/// 5s → 15s → 60s cap. A sustained stable run (default 2 min) resets the index so a
|
||||||
|
/// one-off crash after hours of steady-state doesn't start from the top of the ladder.
|
||||||
|
/// </summary>
|
||||||
|
public sealed class Backoff
|
||||||
|
{
|
||||||
|
public static TimeSpan[] DefaultSequence { get; } =
|
||||||
|
[TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(15), TimeSpan.FromSeconds(60)];
|
||||||
|
|
||||||
|
public TimeSpan StableRunThreshold { get; init; } = TimeSpan.FromMinutes(2);
|
||||||
|
|
||||||
|
private readonly TimeSpan[] _sequence;
|
||||||
|
private int _index;
|
||||||
|
|
||||||
|
public Backoff(TimeSpan[]? sequence = null) => _sequence = sequence ?? DefaultSequence;
|
||||||
|
|
||||||
|
public TimeSpan Next()
|
||||||
|
{
|
||||||
|
var delay = _sequence[Math.Min(_index, _sequence.Length - 1)];
|
||||||
|
_index++;
|
||||||
|
return delay;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void RecordStableRun() => _index = 0;
|
||||||
|
|
||||||
|
public int AttemptIndex => _index;
|
||||||
|
}
|
||||||
@@ -0,0 +1,69 @@
|
|||||||
|
namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Crash-loop circuit breaker for the FOCAS Host. Matches Galaxy Tier-C defaults:
|
||||||
|
/// 3 crashes within 5 minutes opens the breaker; cooldown escalates 1h → 4h → manual
|
||||||
|
/// reset. A sticky alert stays live until the operator explicitly clears it so
|
||||||
|
/// recurring crashes can't silently burn through the cooldown ladder overnight.
|
||||||
|
/// </summary>
|
||||||
|
public sealed class CircuitBreaker
|
||||||
|
{
|
||||||
|
public int CrashesAllowedPerWindow { get; init; } = 3;
|
||||||
|
public TimeSpan Window { get; init; } = TimeSpan.FromMinutes(5);
|
||||||
|
|
||||||
|
public TimeSpan[] CooldownEscalation { get; init; } =
|
||||||
|
[TimeSpan.FromHours(1), TimeSpan.FromHours(4), TimeSpan.MaxValue];
|
||||||
|
|
||||||
|
private readonly List<DateTime> _crashesUtc = [];
|
||||||
|
private DateTime? _openSinceUtc;
|
||||||
|
private int _escalationLevel;
|
||||||
|
public bool StickyAlertActive { get; private set; }
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Records a crash + returns <c>true</c> if the supervisor may respawn. On
|
||||||
|
/// <c>false</c>, <paramref name="cooldownRemaining"/> is how long to wait before
|
||||||
|
/// trying again (<c>TimeSpan.MaxValue</c> means manual reset required).
|
||||||
|
/// </summary>
|
||||||
|
public bool TryRecordCrash(DateTime utcNow, out TimeSpan cooldownRemaining)
|
||||||
|
{
|
||||||
|
if (_openSinceUtc is { } openedAt)
|
||||||
|
{
|
||||||
|
var cooldown = CooldownEscalation[Math.Min(_escalationLevel, CooldownEscalation.Length - 1)];
|
||||||
|
if (cooldown == TimeSpan.MaxValue)
|
||||||
|
{
|
||||||
|
cooldownRemaining = TimeSpan.MaxValue;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (utcNow - openedAt < cooldown)
|
||||||
|
{
|
||||||
|
cooldownRemaining = cooldown - (utcNow - openedAt);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
_openSinceUtc = null;
|
||||||
|
_escalationLevel++;
|
||||||
|
}
|
||||||
|
|
||||||
|
_crashesUtc.RemoveAll(t => utcNow - t > Window);
|
||||||
|
_crashesUtc.Add(utcNow);
|
||||||
|
|
||||||
|
if (_crashesUtc.Count > CrashesAllowedPerWindow)
|
||||||
|
{
|
||||||
|
_openSinceUtc = utcNow;
|
||||||
|
StickyAlertActive = true;
|
||||||
|
cooldownRemaining = CooldownEscalation[Math.Min(_escalationLevel, CooldownEscalation.Length - 1)];
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
cooldownRemaining = TimeSpan.Zero;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void ManualReset()
|
||||||
|
{
|
||||||
|
_crashesUtc.Clear();
|
||||||
|
_openSinceUtc = null;
|
||||||
|
_escalationLevel = 0;
|
||||||
|
StickyAlertActive = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,159 @@
|
|||||||
|
namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Ties <see cref="IHostProcessLauncher"/> + <see cref="Backoff"/> +
|
||||||
|
/// <see cref="CircuitBreaker"/> + <see cref="HeartbeatMonitor"/> into one object the
|
||||||
|
/// driver asks for <c>IFocasClient</c>s. On a detected crash (process exit or
|
||||||
|
/// heartbeat loss) the supervisor fans out <c>BadCommunicationError</c> to all
|
||||||
|
/// subscribers via the <see cref="OnUnavailable"/> callback, then respawns with
|
||||||
|
/// backoff unless the breaker is open.
|
||||||
|
/// </summary>
|
||||||
|
/// <remarks>
|
||||||
|
/// The supervisor itself is I/O-free — it doesn't know how to spawn processes, probe
|
||||||
|
/// pipes, or send heartbeats. Production wires the concrete
|
||||||
|
/// <see cref="IHostProcessLauncher"/> over <c>FocasIpcClient</c> + <c>Process</c>;
|
||||||
|
/// tests drive the same state machine with a deterministic launcher stub.
|
||||||
|
/// </remarks>
|
||||||
|
public sealed class FocasHostSupervisor : IDisposable
|
||||||
|
{
|
||||||
|
private readonly IHostProcessLauncher _launcher;
|
||||||
|
private readonly Backoff _backoff;
|
||||||
|
private readonly CircuitBreaker _breaker;
|
||||||
|
private readonly Func<DateTime> _clock;
|
||||||
|
private IFocasClient? _current;
|
||||||
|
private DateTime _currentStartedUtc;
|
||||||
|
private bool _disposed;
|
||||||
|
|
||||||
|
public FocasHostSupervisor(
|
||||||
|
IHostProcessLauncher launcher,
|
||||||
|
Backoff? backoff = null,
|
||||||
|
CircuitBreaker? breaker = null,
|
||||||
|
Func<DateTime>? clock = null)
|
||||||
|
{
|
||||||
|
_launcher = launcher ?? throw new ArgumentNullException(nameof(launcher));
|
||||||
|
_backoff = backoff ?? new Backoff();
|
||||||
|
_breaker = breaker ?? new CircuitBreaker();
|
||||||
|
_clock = clock ?? (() => DateTime.UtcNow);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>Raised with a short reason string whenever the Host goes unavailable (crash / heartbeat loss / breaker-open).</summary>
|
||||||
|
public event Action<string>? OnUnavailable;
|
||||||
|
|
||||||
|
/// <summary>Crash count observed in the current process lifetime. Exposed for /hosts Admin telemetry.</summary>
|
||||||
|
public int ObservedCrashes { get; private set; }
|
||||||
|
|
||||||
|
/// <summary><c>true</c> if the crash-loop breaker has latched a sticky alert that needs operator reset.</summary>
|
||||||
|
public bool StickyAlertActive => _breaker.StickyAlertActive;
|
||||||
|
|
||||||
|
public int BackoffAttempt => _backoff.AttemptIndex;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Returns the current live client. If none, tries to launch — applying the
|
||||||
|
/// backoff schedule between attempts and stopping once the breaker opens.
|
||||||
|
/// </summary>
|
||||||
|
public async Task<IFocasClient> GetOrLaunchAsync(CancellationToken ct)
|
||||||
|
{
|
||||||
|
ThrowIfDisposed();
|
||||||
|
if (_current is not null && _launcher.IsProcessAlive) return _current;
|
||||||
|
|
||||||
|
return await LaunchWithBackoffAsync(ct).ConfigureAwait(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Called by the heartbeat task each time a miss threshold is crossed.
|
||||||
|
/// Treated as a crash: fan out Bad status + attempt respawn.
|
||||||
|
/// </summary>
|
||||||
|
public async Task NotifyHostDeadAsync(string reason, CancellationToken ct)
|
||||||
|
{
|
||||||
|
ThrowIfDisposed();
|
||||||
|
OnUnavailable?.Invoke(reason);
|
||||||
|
ObservedCrashes++;
|
||||||
|
try { await _launcher.TerminateAsync(ct).ConfigureAwait(false); }
|
||||||
|
catch { /* best effort */ }
|
||||||
|
_current?.Dispose();
|
||||||
|
_current = null;
|
||||||
|
|
||||||
|
if (!_breaker.TryRecordCrash(_clock(), out var cooldown))
|
||||||
|
{
|
||||||
|
OnUnavailable?.Invoke(cooldown == TimeSpan.MaxValue
|
||||||
|
? "circuit-breaker-open-manual-reset-required"
|
||||||
|
: $"circuit-breaker-open-cooldown-{cooldown:g}");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// Successful crash recording — do not respawn synchronously; GetOrLaunchAsync will
|
||||||
|
// pick up the attempt on the next call. Keeps the fan-out fast.
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>Operator action — clear the sticky alert + reset the breaker.</summary>
|
||||||
|
public void AcknowledgeAndReset()
|
||||||
|
{
|
||||||
|
_breaker.ManualReset();
|
||||||
|
_backoff.RecordStableRun();
|
||||||
|
}
|
||||||
|
|
||||||
|
private async Task<IFocasClient> LaunchWithBackoffAsync(CancellationToken ct)
|
||||||
|
{
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
if (_breaker.StickyAlertActive)
|
||||||
|
{
|
||||||
|
if (!_breaker.TryRecordCrash(_clock(), out var cooldown) && cooldown == TimeSpan.MaxValue)
|
||||||
|
throw new InvalidOperationException(
|
||||||
|
"FOCAS Host circuit breaker is open and awaiting manual reset. " +
|
||||||
|
"See Admin /hosts; call AcknowledgeAndReset after investigating the Host log.");
|
||||||
|
}
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
_current = await _launcher.LaunchAsync(ct).ConfigureAwait(false);
|
||||||
|
_currentStartedUtc = _clock();
|
||||||
|
|
||||||
|
// If the launch sequence itself takes long enough to count as a stable run,
|
||||||
|
// reset the backoff ladder immediately.
|
||||||
|
if (_clock() - _currentStartedUtc >= _backoff.StableRunThreshold)
|
||||||
|
_backoff.RecordStableRun();
|
||||||
|
|
||||||
|
return _current;
|
||||||
|
}
|
||||||
|
catch (Exception ex) when (ex is not OperationCanceledException)
|
||||||
|
{
|
||||||
|
OnUnavailable?.Invoke($"launch-failed: {ex.Message}");
|
||||||
|
ObservedCrashes++;
|
||||||
|
if (!_breaker.TryRecordCrash(_clock(), out var cooldown))
|
||||||
|
{
|
||||||
|
var hint = cooldown == TimeSpan.MaxValue
|
||||||
|
? "manual reset required"
|
||||||
|
: $"cooldown {cooldown:g}";
|
||||||
|
throw new InvalidOperationException(
|
||||||
|
$"FOCAS Host circuit breaker opened after {ObservedCrashes} crashes — {hint}.", ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
var delay = _backoff.Next();
|
||||||
|
await Task.Delay(delay, ct).ConfigureAwait(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>Called from the heartbeat loop after a successful ack run — relaxes the backoff ladder.</summary>
|
||||||
|
public void NotifyStableRun()
|
||||||
|
{
|
||||||
|
if (_current is null) return;
|
||||||
|
if (_clock() - _currentStartedUtc >= _backoff.StableRunThreshold)
|
||||||
|
_backoff.RecordStableRun();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void Dispose()
|
||||||
|
{
|
||||||
|
if (_disposed) return;
|
||||||
|
_disposed = true;
|
||||||
|
try { _launcher.TerminateAsync(CancellationToken.None).GetAwaiter().GetResult(); }
|
||||||
|
catch { /* best effort */ }
|
||||||
|
_current?.Dispose();
|
||||||
|
_current = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void ThrowIfDisposed()
|
||||||
|
{
|
||||||
|
if (_disposed) throw new ObjectDisposedException(nameof(FocasHostSupervisor));
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Tracks missed heartbeats from the FOCAS Host. 2s cadence + 3 consecutive misses =
|
||||||
|
/// host declared dead (~6s detection). Same defaults as Galaxy Tier-C so operators
|
||||||
|
/// see the same cadence across hosts on the /hosts Admin page.
|
||||||
|
/// </summary>
|
||||||
|
public sealed class HeartbeatMonitor
|
||||||
|
{
|
||||||
|
public int MissesUntilDead { get; init; } = 3;
|
||||||
|
|
||||||
|
public TimeSpan Cadence { get; init; } = TimeSpan.FromSeconds(2);
|
||||||
|
|
||||||
|
public int ConsecutiveMisses { get; private set; }
|
||||||
|
public DateTime? LastAckUtc { get; private set; }
|
||||||
|
|
||||||
|
public void RecordAck(DateTime utcNow)
|
||||||
|
{
|
||||||
|
ConsecutiveMisses = 0;
|
||||||
|
LastAckUtc = utcNow;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>Records a missed heartbeat; returns <c>true</c> when the death threshold is crossed.</summary>
|
||||||
|
public bool RecordMiss()
|
||||||
|
{
|
||||||
|
ConsecutiveMisses++;
|
||||||
|
return ConsecutiveMisses >= MissesUntilDead;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Abstraction over the act of spawning a FOCAS Host process and obtaining an
|
||||||
|
/// <see cref="IFocasClient"/> connected to it. Production wires this to a real
|
||||||
|
/// <c>Process.Start</c> + <c>FocasIpcClient.ConnectAsync</c>; tests use a fake that
|
||||||
|
/// exposes deterministic failure modes so the supervisor logic can be stressed
|
||||||
|
/// without spawning actual exes.
|
||||||
|
/// </summary>
|
||||||
|
public interface IHostProcessLauncher
|
||||||
|
{
|
||||||
|
/// <summary>
|
||||||
|
/// Spawn a new Host process (if one isn't already running) and return a live
|
||||||
|
/// client session. Throws on unrecoverable errors; transient errors (e.g. Host
|
||||||
|
/// not ready yet) should throw <see cref="TimeoutException"/> so the supervisor
|
||||||
|
/// applies the backoff ladder.
|
||||||
|
/// </summary>
|
||||||
|
Task<IFocasClient> LaunchAsync(CancellationToken ct);
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Terminate the Host process if one is running. Called on Dispose and after a
|
||||||
|
/// heartbeat loss is detected.
|
||||||
|
/// </summary>
|
||||||
|
Task TerminateAsync(CancellationToken ct);
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// <c>true</c> when the most recently spawned Host process is still alive.
|
||||||
|
/// Supervisor polls this at heartbeat cadence; going <c>false</c> without a
|
||||||
|
/// clean shutdown counts as a crash.
|
||||||
|
/// </summary>
|
||||||
|
bool IsProcessAlive { get; }
|
||||||
|
}
|
||||||
249
tests/ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Tests/SupervisorTests.cs
Normal file
249
tests/ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Tests/SupervisorTests.cs
Normal file
@@ -0,0 +1,249 @@
|
|||||||
|
using Shouldly;
|
||||||
|
using Xunit;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor;
|
||||||
|
|
||||||
|
namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Tests;
|
||||||
|
|
||||||
|
[Trait("Category", "Unit")]
|
||||||
|
public sealed class BackoffTests
|
||||||
|
{
|
||||||
|
[Fact]
|
||||||
|
public void Default_sequence_is_5s_15s_60s_then_clamped()
|
||||||
|
{
|
||||||
|
var b = new Backoff();
|
||||||
|
b.Next().ShouldBe(TimeSpan.FromSeconds(5));
|
||||||
|
b.Next().ShouldBe(TimeSpan.FromSeconds(15));
|
||||||
|
b.Next().ShouldBe(TimeSpan.FromSeconds(60));
|
||||||
|
b.Next().ShouldBe(TimeSpan.FromSeconds(60));
|
||||||
|
b.Next().ShouldBe(TimeSpan.FromSeconds(60));
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void RecordStableRun_resets_the_ladder_to_the_start()
|
||||||
|
{
|
||||||
|
var b = new Backoff();
|
||||||
|
b.Next(); b.Next();
|
||||||
|
b.AttemptIndex.ShouldBe(2);
|
||||||
|
b.RecordStableRun();
|
||||||
|
b.AttemptIndex.ShouldBe(0);
|
||||||
|
b.Next().ShouldBe(TimeSpan.FromSeconds(5));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
[Trait("Category", "Unit")]
|
||||||
|
public sealed class CircuitBreakerTests
|
||||||
|
{
|
||||||
|
[Fact]
|
||||||
|
public void Allows_crashes_below_threshold()
|
||||||
|
{
|
||||||
|
var b = new CircuitBreaker();
|
||||||
|
var now = DateTime.UtcNow;
|
||||||
|
b.TryRecordCrash(now, out _).ShouldBeTrue();
|
||||||
|
b.TryRecordCrash(now.AddSeconds(1), out _).ShouldBeTrue();
|
||||||
|
b.TryRecordCrash(now.AddSeconds(2), out _).ShouldBeTrue();
|
||||||
|
b.StickyAlertActive.ShouldBeFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void Opens_when_exceeding_threshold_in_window()
|
||||||
|
{
|
||||||
|
var b = new CircuitBreaker();
|
||||||
|
var now = DateTime.UtcNow;
|
||||||
|
b.TryRecordCrash(now, out _);
|
||||||
|
b.TryRecordCrash(now.AddSeconds(1), out _);
|
||||||
|
b.TryRecordCrash(now.AddSeconds(2), out _);
|
||||||
|
b.TryRecordCrash(now.AddSeconds(3), out var cooldown).ShouldBeFalse();
|
||||||
|
cooldown.ShouldBe(TimeSpan.FromHours(1));
|
||||||
|
b.StickyAlertActive.ShouldBeTrue();
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void Escalates_cooldown_after_second_open()
|
||||||
|
{
|
||||||
|
var b = new CircuitBreaker();
|
||||||
|
var t0 = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc);
|
||||||
|
// First burst — 4 crashes opens breaker with 1h cooldown.
|
||||||
|
for (var i = 0; i < 4; i++) b.TryRecordCrash(t0.AddSeconds(i), out _);
|
||||||
|
b.StickyAlertActive.ShouldBeTrue();
|
||||||
|
|
||||||
|
// Wait past cooldown. The first crash after cooldown-elapsed resets _openSinceUtc and
|
||||||
|
// bumps escalation level; the next 3 crashes then re-open with the escalated 4h cooldown.
|
||||||
|
b.TryRecordCrash(t0.AddHours(1).AddMinutes(1), out _);
|
||||||
|
var t1 = t0.AddHours(1).AddMinutes(1).AddSeconds(1);
|
||||||
|
b.TryRecordCrash(t1, out _);
|
||||||
|
b.TryRecordCrash(t1.AddSeconds(1), out _);
|
||||||
|
b.TryRecordCrash(t1.AddSeconds(2), out var cooldown).ShouldBeFalse();
|
||||||
|
cooldown.ShouldBe(TimeSpan.FromHours(4));
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void ManualReset_clears_everything()
|
||||||
|
{
|
||||||
|
var b = new CircuitBreaker();
|
||||||
|
var now = DateTime.UtcNow;
|
||||||
|
for (var i = 0; i < 5; i++) b.TryRecordCrash(now.AddSeconds(i), out _);
|
||||||
|
b.StickyAlertActive.ShouldBeTrue();
|
||||||
|
b.ManualReset();
|
||||||
|
b.StickyAlertActive.ShouldBeFalse();
|
||||||
|
b.TryRecordCrash(now.AddSeconds(10), out _).ShouldBeTrue();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
[Trait("Category", "Unit")]
|
||||||
|
public sealed class HeartbeatMonitorTests
|
||||||
|
{
|
||||||
|
[Fact]
|
||||||
|
public void Three_consecutive_misses_declares_dead()
|
||||||
|
{
|
||||||
|
var m = new HeartbeatMonitor();
|
||||||
|
m.RecordMiss().ShouldBeFalse();
|
||||||
|
m.RecordMiss().ShouldBeFalse();
|
||||||
|
m.RecordMiss().ShouldBeTrue();
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void Ack_resets_the_miss_counter()
|
||||||
|
{
|
||||||
|
var m = new HeartbeatMonitor();
|
||||||
|
m.RecordMiss(); m.RecordMiss();
|
||||||
|
m.ConsecutiveMisses.ShouldBe(2);
|
||||||
|
m.RecordAck(DateTime.UtcNow);
|
||||||
|
m.ConsecutiveMisses.ShouldBe(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
[Trait("Category", "Unit")]
|
||||||
|
public sealed class FocasHostSupervisorTests
|
||||||
|
{
|
||||||
|
private sealed class FakeLauncher : IHostProcessLauncher
|
||||||
|
{
|
||||||
|
public int LaunchAttempts { get; private set; }
|
||||||
|
public int Terminations { get; private set; }
|
||||||
|
public Queue<Func<IFocasClient>> Plan { get; } = new();
|
||||||
|
public bool IsProcessAlive { get; set; }
|
||||||
|
|
||||||
|
public Task<IFocasClient> LaunchAsync(CancellationToken ct)
|
||||||
|
{
|
||||||
|
LaunchAttempts++;
|
||||||
|
if (Plan.Count == 0) throw new InvalidOperationException("FakeLauncher plan exhausted");
|
||||||
|
var next = Plan.Dequeue()();
|
||||||
|
IsProcessAlive = true;
|
||||||
|
return Task.FromResult(next);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Task TerminateAsync(CancellationToken ct)
|
||||||
|
{
|
||||||
|
Terminations++;
|
||||||
|
IsProcessAlive = false;
|
||||||
|
return Task.CompletedTask;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private sealed class StubFocasClient : IFocasClient
|
||||||
|
{
|
||||||
|
public bool IsConnected => true;
|
||||||
|
public Task ConnectAsync(FocasHostAddress address, TimeSpan timeout, CancellationToken ct) => Task.CompletedTask;
|
||||||
|
public Task<(object? value, uint status)> ReadAsync(FocasAddress a, FocasDataType t, CancellationToken ct) =>
|
||||||
|
Task.FromResult<(object?, uint)>((0, 0));
|
||||||
|
public Task<uint> WriteAsync(FocasAddress a, FocasDataType t, object? v, CancellationToken ct) => Task.FromResult(0u);
|
||||||
|
public Task<bool> ProbeAsync(CancellationToken ct) => Task.FromResult(true);
|
||||||
|
public void Dispose() { }
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task GetOrLaunch_returns_client_on_first_success()
|
||||||
|
{
|
||||||
|
var launcher = new FakeLauncher();
|
||||||
|
launcher.Plan.Enqueue(() => new StubFocasClient());
|
||||||
|
var supervisor = new FocasHostSupervisor(launcher);
|
||||||
|
var client = await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken);
|
||||||
|
client.ShouldNotBeNull();
|
||||||
|
launcher.LaunchAttempts.ShouldBe(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task GetOrLaunch_retries_after_transient_failure_with_backoff()
|
||||||
|
{
|
||||||
|
var launcher = new FakeLauncher();
|
||||||
|
launcher.Plan.Enqueue(() => throw new TimeoutException("pipe not ready"));
|
||||||
|
launcher.Plan.Enqueue(() => new StubFocasClient());
|
||||||
|
|
||||||
|
var backoff = new Backoff([TimeSpan.FromMilliseconds(10), TimeSpan.FromMilliseconds(20)]);
|
||||||
|
var supervisor = new FocasHostSupervisor(launcher, backoff);
|
||||||
|
|
||||||
|
var unavailableMessages = new List<string>();
|
||||||
|
supervisor.OnUnavailable += m => unavailableMessages.Add(m);
|
||||||
|
|
||||||
|
var client = await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken);
|
||||||
|
client.ShouldNotBeNull();
|
||||||
|
launcher.LaunchAttempts.ShouldBe(2);
|
||||||
|
unavailableMessages.Count.ShouldBe(1);
|
||||||
|
unavailableMessages[0].ShouldContain("launch-failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task Repeated_launch_failures_open_breaker_and_surface_InvalidOperation()
|
||||||
|
{
|
||||||
|
var launcher = new FakeLauncher();
|
||||||
|
for (var i = 0; i < 10; i++)
|
||||||
|
launcher.Plan.Enqueue(() => throw new InvalidOperationException("simulated host refused"));
|
||||||
|
|
||||||
|
var supervisor = new FocasHostSupervisor(
|
||||||
|
launcher,
|
||||||
|
backoff: new Backoff([TimeSpan.FromMilliseconds(1)]),
|
||||||
|
breaker: new CircuitBreaker { CrashesAllowedPerWindow = 2, Window = TimeSpan.FromMinutes(5) });
|
||||||
|
|
||||||
|
var ex = await Should.ThrowAsync<InvalidOperationException>(async () =>
|
||||||
|
await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken));
|
||||||
|
ex.Message.ShouldContain("circuit breaker");
|
||||||
|
supervisor.StickyAlertActive.ShouldBeTrue();
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task NotifyHostDeadAsync_terminates_current_and_fans_out_unavailable()
|
||||||
|
{
|
||||||
|
var launcher = new FakeLauncher();
|
||||||
|
launcher.Plan.Enqueue(() => new StubFocasClient());
|
||||||
|
var supervisor = new FocasHostSupervisor(launcher);
|
||||||
|
|
||||||
|
var messages = new List<string>();
|
||||||
|
supervisor.OnUnavailable += m => messages.Add(m);
|
||||||
|
await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken);
|
||||||
|
|
||||||
|
await supervisor.NotifyHostDeadAsync("heartbeat-loss", TestContext.Current.CancellationToken);
|
||||||
|
|
||||||
|
launcher.Terminations.ShouldBe(1);
|
||||||
|
messages.ShouldContain("heartbeat-loss");
|
||||||
|
supervisor.ObservedCrashes.ShouldBe(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task AcknowledgeAndReset_clears_sticky_alert()
|
||||||
|
{
|
||||||
|
var launcher = new FakeLauncher();
|
||||||
|
for (var i = 0; i < 10; i++)
|
||||||
|
launcher.Plan.Enqueue(() => throw new InvalidOperationException("refused"));
|
||||||
|
var supervisor = new FocasHostSupervisor(
|
||||||
|
launcher,
|
||||||
|
backoff: new Backoff([TimeSpan.FromMilliseconds(1)]),
|
||||||
|
breaker: new CircuitBreaker { CrashesAllowedPerWindow = 1 });
|
||||||
|
|
||||||
|
try { await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken); } catch { }
|
||||||
|
supervisor.StickyAlertActive.ShouldBeTrue();
|
||||||
|
|
||||||
|
supervisor.AcknowledgeAndReset();
|
||||||
|
supervisor.StickyAlertActive.ShouldBeFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task Dispose_terminates_host_process()
|
||||||
|
{
|
||||||
|
var launcher = new FakeLauncher();
|
||||||
|
launcher.Plan.Enqueue(() => new StubFocasClient());
|
||||||
|
var supervisor = new FocasHostSupervisor(launcher);
|
||||||
|
await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken);
|
||||||
|
|
||||||
|
supervisor.Dispose();
|
||||||
|
launcher.Terminations.ShouldBe(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user