namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor;

/// <summary>
/// Ties an <see cref="IHostProcessLauncher"/>, a <see cref="Backoff"/> schedule and a
/// <see cref="CircuitBreaker"/> into the single object the driver asks for
/// <see cref="IFocasClient"/> instances. When the Host is reported dead (process exit or
/// heartbeat loss) the supervisor raises <see cref="OnUnavailable"/>, tears the current
/// client down, and lets the next <see cref="GetOrLaunchAsync"/> call respawn it with
/// backoff unless the breaker refuses.
/// </summary>
/// <remarks>
/// The supervisor itself does not spawn processes, probe pipes, or send heartbeats; all of
/// that is delegated to the injected launcher, so tests can drive the same state machine
/// with a deterministic launcher stub and clock. The class takes no locks of its own.
/// </remarks>
public sealed class FocasHostSupervisor : IDisposable
{
    private readonly IHostProcessLauncher _launcher;
    private readonly Backoff _backoff;
    private readonly CircuitBreaker _breaker;
    private readonly Func<DateTime> _clock;

    private IFocasClient? _current;
    private DateTime _currentStartedUtc;
    private bool _disposed;

    /// <summary>Creates a supervisor around the given launcher.</summary>
    /// <param name="launcher">Launches and terminates the Host process. Required.</param>
    /// <param name="backoff">Delay schedule between failed launches; a default instance is created when null.</param>
    /// <param name="breaker">Crash-loop breaker; a default instance is created when null.</param>
    /// <param name="clock">Time source; defaults to <see cref="DateTime.UtcNow"/>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="launcher"/> is null.</exception>
    public FocasHostSupervisor(
        IHostProcessLauncher launcher,
        Backoff? backoff = null,
        CircuitBreaker? breaker = null,
        Func<DateTime>? clock = null)
    {
        _launcher = launcher ?? throw new ArgumentNullException(nameof(launcher));
        _backoff = backoff ?? new Backoff();
        _breaker = breaker ?? new CircuitBreaker();
        _clock = clock ?? (() => DateTime.UtcNow);
    }

    /// <summary>
    /// Raised with a short reason string whenever the Host goes unavailable
    /// (crash / heartbeat loss / launch failure / breaker open).
    /// </summary>
    public event Action<string>? OnUnavailable;

    /// <summary>
    /// Number of crashes and failed launches counted by this instance. Exposed for /hosts Admin telemetry.
    /// </summary>
    public int ObservedCrashes { get; private set; }

    /// <summary><c>true</c> if the crash-loop breaker has latched a sticky alert that needs operator reset.</summary>
    public bool StickyAlertActive => _breaker.StickyAlertActive;

    /// <summary>Current attempt index reported by the backoff schedule.</summary>
    public int BackoffAttempt => _backoff.AttemptIndex;

    /// <summary>
    /// Returns the current live client. If there is none, or the launcher reports the
    /// process is no longer alive, launches a new one, applying the backoff schedule
    /// between failed attempts and stopping once the breaker refuses.
    /// </summary>
    /// <param name="ct">Cancels the launch and any pending backoff delay.</param>
    /// <returns>The client connected to the running Host.</returns>
    /// <exception cref="ObjectDisposedException">The supervisor has been disposed.</exception>
    /// <exception cref="InvalidOperationException">The circuit breaker refused a further launch.</exception>
    public async Task<IFocasClient> GetOrLaunchAsync(CancellationToken ct)
    {
        ThrowIfDisposed();

        if (_current is not null)
        {
            if (_launcher.IsProcessAlive)
            {
                return _current;
            }

            // The process died without NotifyHostDeadAsync having torn the client down.
            // Dispose it here; otherwise the relaunch overwrites _current and the stale
            // client is never released.
            DisposeCurrentClient();
        }

        return await LaunchWithBackoffAsync(ct).ConfigureAwait(false);
    }

    /// <summary>
    /// Reports that the Host is dead (for example from the heartbeat task once a miss
    /// threshold is crossed). Raises <see cref="OnUnavailable"/>, terminates the process,
    /// disposes the current client and records the crash with the breaker.
    /// </summary>
    /// <param name="reason">Short reason string forwarded to <see cref="OnUnavailable"/> subscribers.</param>
    /// <param name="ct">Forwarded to the launcher's terminate call.</param>
    /// <exception cref="ObjectDisposedException">The supervisor has been disposed.</exception>
    public async Task NotifyHostDeadAsync(string reason, CancellationToken ct)
    {
        ThrowIfDisposed();

        OnUnavailable?.Invoke(reason);
        ObservedCrashes++;

        try
        {
            await _launcher.TerminateAsync(ct).ConfigureAwait(false);
        }
        catch
        {
            /* best effort */
        }

        DisposeCurrentClient();

        if (!_breaker.TryRecordCrash(_clock(), out var cooldown))
        {
            OnUnavailable?.Invoke(cooldown == TimeSpan.MaxValue
                ? "circuit-breaker-open-manual-reset-required"
                : $"circuit-breaker-open-cooldown-{cooldown:g}");
            return;
        }

        // Successful crash recording — do not respawn synchronously; GetOrLaunchAsync will
        // pick up the attempt on the next call. Keeps the fan-out fast.
    }

    /// <summary>Operator action — clear the sticky alert + reset the breaker and the backoff ladder.</summary>
    public void AcknowledgeAndReset()
    {
        _breaker.ManualReset();
        _backoff.RecordStableRun();
    }

    /// <summary>
    /// Launch loop: tries the launcher, and on failure reports it, records the crash with
    /// the breaker, waits for the next backoff delay and tries again.
    /// </summary>
    /// <param name="ct">Cancels the launch attempt or the backoff delay.</param>
    /// <returns>The freshly launched client, which is also stored as the current client.</returns>
    /// <exception cref="InvalidOperationException">The breaker is open or opened during the loop.</exception>
    private async Task<IFocasClient> LaunchWithBackoffAsync(CancellationToken ct)
    {
        while (true)
        {
            if (_breaker.StickyAlertActive)
            {
                // NOTE(review): TryRecordCrash is used here to read the breaker state; if it
                // also records a crash, every blocked call adds one — confirm against CircuitBreaker.
                if (!_breaker.TryRecordCrash(_clock(), out var cooldown) && cooldown == TimeSpan.MaxValue)
                {
                    throw new InvalidOperationException(
                        "FOCAS Host circuit breaker is open and awaiting manual reset. " +
                        "See Admin /hosts; call AcknowledgeAndReset after investigating the Host log.");
                }
            }

            try
            {
                _current = await _launcher.LaunchAsync(ct).ConfigureAwait(false);
                _currentStartedUtc = _clock();

                // If the launch sequence itself takes long enough to count as a stable run,
                // reset the backoff ladder immediately.
                // NOTE(review): the start timestamp is taken after LaunchAsync returns, so this
                // compares two back-to-back clock reads rather than the launch duration — confirm intent.
                if (_clock() - _currentStartedUtc >= _backoff.StableRunThreshold)
                {
                    _backoff.RecordStableRun();
                }

                return _current;
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                OnUnavailable?.Invoke($"launch-failed: {ex.Message}");
                ObservedCrashes++;

                if (!_breaker.TryRecordCrash(_clock(), out var cooldown))
                {
                    var hint = cooldown == TimeSpan.MaxValue
                        ? "manual reset required"
                        : $"cooldown {cooldown:g}";
                    throw new InvalidOperationException(
                        $"FOCAS Host circuit breaker opened after {ObservedCrashes} crashes — {hint}.", ex);
                }

                var delay = _backoff.Next();
                await Task.Delay(delay, ct).ConfigureAwait(false);
            }
        }
    }

    /// <summary>
    /// Called from the heartbeat loop after a successful ack run — relaxes the backoff ladder
    /// once the current client has been up for at least the stable-run threshold.
    /// </summary>
    public void NotifyStableRun()
    {
        if (_current is null)
        {
            return;
        }

        if (_clock() - _currentStartedUtc >= _backoff.StableRunThreshold)
        {
            _backoff.RecordStableRun();
        }
    }

    /// <summary>
    /// Terminates the Host process (best effort) and disposes the current client.
    /// Calling it more than once is a no-op.
    /// </summary>
    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }

        _disposed = true;

        try
        {
            // NOTE(review): blocks on an async call because IDisposable.Dispose is synchronous.
            _launcher.TerminateAsync(CancellationToken.None).GetAwaiter().GetResult();
        }
        catch
        {
            /* best effort */
        }

        DisposeCurrentClient();
    }

    /// <summary>Disposes the current client, if any, and clears the field.</summary>
    private void DisposeCurrentClient()
    {
        _current?.Dispose();
        _current = null;
    }

    /// <summary>Throws <see cref="ObjectDisposedException"/> once <see cref="Dispose"/> has run.</summary>
    private void ThrowIfDisposed()
    {
        if (_disposed)
        {
            throw new ObjectDisposedException(nameof(FocasHostSupervisor));
        }
    }
}