namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor;
///
/// Ties + +
/// + into one object the
/// driver asks for IFocasClients. On a detected crash (process exit or
/// heartbeat loss) the supervisor fans out BadCommunicationError to all
/// subscribers via the callback, then respawns with
/// backoff unless the breaker is open.
///
///
/// The supervisor itself is I/O-free — it doesn't know how to spawn processes, probe
/// pipes, or send heartbeats. Production wires the concrete
/// over FocasIpcClient + Process;
/// tests drive the same state machine with a deterministic launcher stub.
///
public sealed class FocasHostSupervisor : IDisposable
{
private readonly IHostProcessLauncher _launcher;
private readonly Backoff _backoff;
private readonly CircuitBreaker _breaker;
private readonly Func _clock;
private IFocasClient? _current;
private DateTime _currentStartedUtc;
private bool _disposed;
public FocasHostSupervisor(
IHostProcessLauncher launcher,
Backoff? backoff = null,
CircuitBreaker? breaker = null,
Func? clock = null)
{
_launcher = launcher ?? throw new ArgumentNullException(nameof(launcher));
_backoff = backoff ?? new Backoff();
_breaker = breaker ?? new CircuitBreaker();
_clock = clock ?? (() => DateTime.UtcNow);
}
/// Raised with a short reason string whenever the Host goes unavailable (crash / heartbeat loss / breaker-open).
public event Action? OnUnavailable;
/// Crash count observed in the current process lifetime. Exposed for /hosts Admin telemetry.
public int ObservedCrashes { get; private set; }
/// true if the crash-loop breaker has latched a sticky alert that needs operator reset.
public bool StickyAlertActive => _breaker.StickyAlertActive;
public int BackoffAttempt => _backoff.AttemptIndex;
///
/// Returns the current live client. If none, tries to launch — applying the
/// backoff schedule between attempts and stopping once the breaker opens.
///
public async Task GetOrLaunchAsync(CancellationToken ct)
{
ThrowIfDisposed();
if (_current is not null && _launcher.IsProcessAlive) return _current;
return await LaunchWithBackoffAsync(ct).ConfigureAwait(false);
}
///
/// Called by the heartbeat task each time a miss threshold is crossed.
/// Treated as a crash: fan out Bad status + attempt respawn.
///
public async Task NotifyHostDeadAsync(string reason, CancellationToken ct)
{
ThrowIfDisposed();
OnUnavailable?.Invoke(reason);
ObservedCrashes++;
try { await _launcher.TerminateAsync(ct).ConfigureAwait(false); }
catch { /* best effort */ }
_current?.Dispose();
_current = null;
if (!_breaker.TryRecordCrash(_clock(), out var cooldown))
{
OnUnavailable?.Invoke(cooldown == TimeSpan.MaxValue
? "circuit-breaker-open-manual-reset-required"
: $"circuit-breaker-open-cooldown-{cooldown:g}");
return;
}
// Successful crash recording — do not respawn synchronously; GetOrLaunchAsync will
// pick up the attempt on the next call. Keeps the fan-out fast.
}
/// Operator action — clear the sticky alert + reset the breaker.
public void AcknowledgeAndReset()
{
_breaker.ManualReset();
_backoff.RecordStableRun();
}
private async Task LaunchWithBackoffAsync(CancellationToken ct)
{
while (true)
{
if (_breaker.StickyAlertActive)
{
if (!_breaker.TryRecordCrash(_clock(), out var cooldown) && cooldown == TimeSpan.MaxValue)
throw new InvalidOperationException(
"FOCAS Host circuit breaker is open and awaiting manual reset. " +
"See Admin /hosts; call AcknowledgeAndReset after investigating the Host log.");
}
try
{
_current = await _launcher.LaunchAsync(ct).ConfigureAwait(false);
_currentStartedUtc = _clock();
// If the launch sequence itself takes long enough to count as a stable run,
// reset the backoff ladder immediately.
if (_clock() - _currentStartedUtc >= _backoff.StableRunThreshold)
_backoff.RecordStableRun();
return _current;
}
catch (Exception ex) when (ex is not OperationCanceledException)
{
OnUnavailable?.Invoke($"launch-failed: {ex.Message}");
ObservedCrashes++;
if (!_breaker.TryRecordCrash(_clock(), out var cooldown))
{
var hint = cooldown == TimeSpan.MaxValue
? "manual reset required"
: $"cooldown {cooldown:g}";
throw new InvalidOperationException(
$"FOCAS Host circuit breaker opened after {ObservedCrashes} crashes — {hint}.", ex);
}
var delay = _backoff.Next();
await Task.Delay(delay, ct).ConfigureAwait(false);
}
}
}
/// Called from the heartbeat loop after a successful ack run — relaxes the backoff ladder.
public void NotifyStableRun()
{
if (_current is null) return;
if (_clock() - _currentStartedUtc >= _backoff.StableRunThreshold)
_backoff.RecordStableRun();
}
public void Dispose()
{
if (_disposed) return;
_disposed = true;
try { _launcher.TerminateAsync(CancellationToken.None).GetAwaiter().GetResult(); }
catch { /* best effort */ }
_current?.Dispose();
_current = null;
}
private void ThrowIfDisposed()
{
if (_disposed) throw new ObjectDisposedException(nameof(FocasHostSupervisor));
}
}