Files
lmxopcua/tests/ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Tests/SupervisorTests.cs
Joseph Doherty 5033609944 FOCAS Tier-C PR D — supervisor + backoff + crash-loop breaker + heartbeat monitor. Fourth of 5 PRs for #220. Ships the resilience harness that sits between the driver's IFocasClient requests and the Tier-C Host process, so a crashing Fwlib32.dll takes down only the Host (not the main server), gets respawned on a backoff ladder, and opens a circuit with a sticky operator alert when the crash rate is pathological. Same shape as Galaxy Tier-C so the Admin /hosts surface has a single mental model. New Supervisor/ namespace in Driver.FOCAS (.NET 10, Proxy-side): Backoff with the 5s→15s→60s default ladder + StableRunThreshold that resets the index after a 2-min clean run (so a one-off crash after hours of steady-state doesn't restart from the top); CircuitBreaker with 3-crashes-in-5-min threshold + escalating 1h→4h→manual-reset cooldown ladder + StickyAlertActive flag that persists across cooldowns until AcknowledgeAndReset is called; HeartbeatMonitor tracking ConsecutiveMisses against the 3-misses-kill threshold + LastAckUtc for telemetry; IHostProcessLauncher abstraction over "spawn Host process + produce an IFocasClient connected to it" so the supervisor stays I/O-free and fully testable with a fake launcher that can be told to throw on specific attempts (production wiring over Process.Start + FocasIpcClient.ConnectAsync is the PR E ops-glue concern); FocasHostSupervisor orchestrating them — GetOrLaunchAsync cycles through backoff until either a client is returned or the breaker opens (surfaced as InvalidOperationException so the driver maps to BadDeviceFailure), NotifyHostDeadAsync fans out the unavailable event + terminates the current launcher + records the crash without blocking (so heartbeat-loss detection can short-circuit subscriber fan-out and let the next GetOrLaunchAsync handle the respawn), AcknowledgeAndReset is the operator-clear path, OnUnavailable event for Admin /hosts wiring + ObservedCrashes + BackoffAttempt + StickyAlertActive for telemetry. 14 new unit tests across SupervisorTests.cs: Backoff (default sequence, clamping, RecordStableRun resets), CircuitBreaker (below threshold allowed, opens at threshold, escalates cooldown after second open, ManualReset clears state), HeartbeatMonitor (3 consecutive misses declares dead, ack resets counter), FocasHostSupervisor (first-launch success, retry-with-backoff after transient failure, repeated failures open breaker + surface InvalidOperationException, NotifyHostDeadAsync terminates + fan-outs + increments crash count, AcknowledgeAndReset clears sticky, Dispose terminates). Full FOCAS driver tests now 186/186 green (172 + 14 new). No changes to IFocasClient DI contract; existing FakeFocasClient-based tests unaffected. PR E wires the real Process-based IHostProcessLauncher + NSSM install scripts + MMF post-mortem + docs.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 14:17:23 -04:00

250 lines
9.0 KiB
C#

using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor;
namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Tests;
[Trait("Category", "Unit")]
public sealed class BackoffTests
{
[Fact]
public void Default_sequence_is_5s_15s_60s_then_clamped()
{
var b = new Backoff();
b.Next().ShouldBe(TimeSpan.FromSeconds(5));
b.Next().ShouldBe(TimeSpan.FromSeconds(15));
b.Next().ShouldBe(TimeSpan.FromSeconds(60));
b.Next().ShouldBe(TimeSpan.FromSeconds(60));
b.Next().ShouldBe(TimeSpan.FromSeconds(60));
}
[Fact]
public void RecordStableRun_resets_the_ladder_to_the_start()
{
var b = new Backoff();
b.Next(); b.Next();
b.AttemptIndex.ShouldBe(2);
b.RecordStableRun();
b.AttemptIndex.ShouldBe(0);
b.Next().ShouldBe(TimeSpan.FromSeconds(5));
}
}
[Trait("Category", "Unit")]
public sealed class CircuitBreakerTests
{
[Fact]
public void Allows_crashes_below_threshold()
{
var b = new CircuitBreaker();
var now = DateTime.UtcNow;
b.TryRecordCrash(now, out _).ShouldBeTrue();
b.TryRecordCrash(now.AddSeconds(1), out _).ShouldBeTrue();
b.TryRecordCrash(now.AddSeconds(2), out _).ShouldBeTrue();
b.StickyAlertActive.ShouldBeFalse();
}
[Fact]
public void Opens_when_exceeding_threshold_in_window()
{
var b = new CircuitBreaker();
var now = DateTime.UtcNow;
b.TryRecordCrash(now, out _);
b.TryRecordCrash(now.AddSeconds(1), out _);
b.TryRecordCrash(now.AddSeconds(2), out _);
b.TryRecordCrash(now.AddSeconds(3), out var cooldown).ShouldBeFalse();
cooldown.ShouldBe(TimeSpan.FromHours(1));
b.StickyAlertActive.ShouldBeTrue();
}
[Fact]
public void Escalates_cooldown_after_second_open()
{
var b = new CircuitBreaker();
var t0 = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc);
// First burst — 4 crashes opens breaker with 1h cooldown.
for (var i = 0; i < 4; i++) b.TryRecordCrash(t0.AddSeconds(i), out _);
b.StickyAlertActive.ShouldBeTrue();
// Wait past cooldown. The first crash after cooldown-elapsed resets _openSinceUtc and
// bumps escalation level; the next 3 crashes then re-open with the escalated 4h cooldown.
b.TryRecordCrash(t0.AddHours(1).AddMinutes(1), out _);
var t1 = t0.AddHours(1).AddMinutes(1).AddSeconds(1);
b.TryRecordCrash(t1, out _);
b.TryRecordCrash(t1.AddSeconds(1), out _);
b.TryRecordCrash(t1.AddSeconds(2), out var cooldown).ShouldBeFalse();
cooldown.ShouldBe(TimeSpan.FromHours(4));
}
[Fact]
public void ManualReset_clears_everything()
{
var b = new CircuitBreaker();
var now = DateTime.UtcNow;
for (var i = 0; i < 5; i++) b.TryRecordCrash(now.AddSeconds(i), out _);
b.StickyAlertActive.ShouldBeTrue();
b.ManualReset();
b.StickyAlertActive.ShouldBeFalse();
b.TryRecordCrash(now.AddSeconds(10), out _).ShouldBeTrue();
}
}
[Trait("Category", "Unit")]
public sealed class HeartbeatMonitorTests
{
[Fact]
public void Three_consecutive_misses_declares_dead()
{
var m = new HeartbeatMonitor();
m.RecordMiss().ShouldBeFalse();
m.RecordMiss().ShouldBeFalse();
m.RecordMiss().ShouldBeTrue();
}
[Fact]
public void Ack_resets_the_miss_counter()
{
var m = new HeartbeatMonitor();
m.RecordMiss(); m.RecordMiss();
m.ConsecutiveMisses.ShouldBe(2);
m.RecordAck(DateTime.UtcNow);
m.ConsecutiveMisses.ShouldBe(0);
}
}
[Trait("Category", "Unit")]
public sealed class FocasHostSupervisorTests
{
private sealed class FakeLauncher : IHostProcessLauncher
{
public int LaunchAttempts { get; private set; }
public int Terminations { get; private set; }
public Queue<Func<IFocasClient>> Plan { get; } = new();
public bool IsProcessAlive { get; set; }
public Task<IFocasClient> LaunchAsync(CancellationToken ct)
{
LaunchAttempts++;
if (Plan.Count == 0) throw new InvalidOperationException("FakeLauncher plan exhausted");
var next = Plan.Dequeue()();
IsProcessAlive = true;
return Task.FromResult(next);
}
public Task TerminateAsync(CancellationToken ct)
{
Terminations++;
IsProcessAlive = false;
return Task.CompletedTask;
}
}
private sealed class StubFocasClient : IFocasClient
{
public bool IsConnected => true;
public Task ConnectAsync(FocasHostAddress address, TimeSpan timeout, CancellationToken ct) => Task.CompletedTask;
public Task<(object? value, uint status)> ReadAsync(FocasAddress a, FocasDataType t, CancellationToken ct) =>
Task.FromResult<(object?, uint)>((0, 0));
public Task<uint> WriteAsync(FocasAddress a, FocasDataType t, object? v, CancellationToken ct) => Task.FromResult(0u);
public Task<bool> ProbeAsync(CancellationToken ct) => Task.FromResult(true);
public void Dispose() { }
}
[Fact]
public async Task GetOrLaunch_returns_client_on_first_success()
{
var launcher = new FakeLauncher();
launcher.Plan.Enqueue(() => new StubFocasClient());
var supervisor = new FocasHostSupervisor(launcher);
var client = await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken);
client.ShouldNotBeNull();
launcher.LaunchAttempts.ShouldBe(1);
}
[Fact]
public async Task GetOrLaunch_retries_after_transient_failure_with_backoff()
{
var launcher = new FakeLauncher();
launcher.Plan.Enqueue(() => throw new TimeoutException("pipe not ready"));
launcher.Plan.Enqueue(() => new StubFocasClient());
var backoff = new Backoff([TimeSpan.FromMilliseconds(10), TimeSpan.FromMilliseconds(20)]);
var supervisor = new FocasHostSupervisor(launcher, backoff);
var unavailableMessages = new List<string>();
supervisor.OnUnavailable += m => unavailableMessages.Add(m);
var client = await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken);
client.ShouldNotBeNull();
launcher.LaunchAttempts.ShouldBe(2);
unavailableMessages.Count.ShouldBe(1);
unavailableMessages[0].ShouldContain("launch-failed");
}
[Fact]
public async Task Repeated_launch_failures_open_breaker_and_surface_InvalidOperation()
{
var launcher = new FakeLauncher();
for (var i = 0; i < 10; i++)
launcher.Plan.Enqueue(() => throw new InvalidOperationException("simulated host refused"));
var supervisor = new FocasHostSupervisor(
launcher,
backoff: new Backoff([TimeSpan.FromMilliseconds(1)]),
breaker: new CircuitBreaker { CrashesAllowedPerWindow = 2, Window = TimeSpan.FromMinutes(5) });
var ex = await Should.ThrowAsync<InvalidOperationException>(async () =>
await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken));
ex.Message.ShouldContain("circuit breaker");
supervisor.StickyAlertActive.ShouldBeTrue();
}
[Fact]
public async Task NotifyHostDeadAsync_terminates_current_and_fans_out_unavailable()
{
var launcher = new FakeLauncher();
launcher.Plan.Enqueue(() => new StubFocasClient());
var supervisor = new FocasHostSupervisor(launcher);
var messages = new List<string>();
supervisor.OnUnavailable += m => messages.Add(m);
await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken);
await supervisor.NotifyHostDeadAsync("heartbeat-loss", TestContext.Current.CancellationToken);
launcher.Terminations.ShouldBe(1);
messages.ShouldContain("heartbeat-loss");
supervisor.ObservedCrashes.ShouldBe(1);
}
[Fact]
public async Task AcknowledgeAndReset_clears_sticky_alert()
{
var launcher = new FakeLauncher();
for (var i = 0; i < 10; i++)
launcher.Plan.Enqueue(() => throw new InvalidOperationException("refused"));
var supervisor = new FocasHostSupervisor(
launcher,
backoff: new Backoff([TimeSpan.FromMilliseconds(1)]),
breaker: new CircuitBreaker { CrashesAllowedPerWindow = 1 });
try { await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken); } catch { }
supervisor.StickyAlertActive.ShouldBeTrue();
supervisor.AcknowledgeAndReset();
supervisor.StickyAlertActive.ShouldBeFalse();
}
[Fact]
public async Task Dispose_terminates_host_process()
{
var launcher = new FakeLauncher();
launcher.Plan.Enqueue(() => new StubFocasClient());
var supervisor = new FocasHostSupervisor(launcher);
await supervisor.GetOrLaunchAsync(TestContext.Current.CancellationToken);
supervisor.Dispose();
launcher.Terminations.ShouldBe(1);
}
}