160 lines
6.3 KiB
C#
160 lines
6.3 KiB
C#
namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS.Supervisor;
|
|
|
|
/// <summary>
/// Ties <see cref="IHostProcessLauncher"/> + <see cref="Backoff"/> +
/// <see cref="CircuitBreaker"/> + <see cref="HeartbeatMonitor"/> into one object the
/// driver asks for <c>IFocasClient</c>s. On a detected crash (process exit or
/// heartbeat loss) the supervisor fans out <c>BadCommunicationError</c> to all
/// subscribers via the <see cref="OnUnavailable"/> callback. The respawn itself is
/// lazy: it happens with backoff on the next <see cref="GetOrLaunchAsync"/> call,
/// unless the breaker is open.
/// </summary>
/// <remarks>
/// The supervisor itself is I/O-free — it doesn't know how to spawn processes, probe
/// pipes, or send heartbeats. Production wires the concrete
/// <see cref="IHostProcessLauncher"/> over <c>FocasIpcClient</c> + <c>Process</c>;
/// tests drive the same state machine with a deterministic launcher stub.
/// </remarks>
public sealed class FocasHostSupervisor : IDisposable
{
    private readonly IHostProcessLauncher _launcher;
    private readonly Backoff _backoff;
    private readonly CircuitBreaker _breaker;
    private readonly Func<DateTime> _clock;

    // Client returned by the most recent successful launch; null before the first launch
    // and after a crash notification, a stale-client cleanup, or Dispose.
    private IFocasClient? _current;

    // Clock reading taken immediately after the most recent successful launch.
    private DateTime _currentStartedUtc;

    private bool _disposed;

    /// <summary>Creates a supervisor around the given launcher.</summary>
    /// <param name="launcher">Launches and terminates the Host process. Required.</param>
    /// <param name="backoff">Retry-delay schedule; a default <see cref="Backoff"/> is created when <c>null</c>.</param>
    /// <param name="breaker">Crash-loop breaker; a default <see cref="CircuitBreaker"/> is created when <c>null</c>.</param>
    /// <param name="clock">Time source; defaults to <see cref="DateTime.UtcNow"/> when <c>null</c>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="launcher"/> is <c>null</c>.</exception>
    public FocasHostSupervisor(
        IHostProcessLauncher launcher,
        Backoff? backoff = null,
        CircuitBreaker? breaker = null,
        Func<DateTime>? clock = null)
    {
        _launcher = launcher ?? throw new ArgumentNullException(nameof(launcher));
        _backoff = backoff ?? new Backoff();
        _breaker = breaker ?? new CircuitBreaker();
        _clock = clock ?? (() => DateTime.UtcNow);
    }

    /// <summary>Raised with a short reason string whenever the Host goes unavailable (crash / heartbeat loss / breaker-open).</summary>
    public event Action<string>? OnUnavailable;

    /// <summary>Crash count observed in the current process lifetime. Exposed for /hosts Admin telemetry.</summary>
    public int ObservedCrashes { get; private set; }

    /// <summary><c>true</c> if the crash-loop breaker has latched a sticky alert that needs operator reset.</summary>
    public bool StickyAlertActive => _breaker.StickyAlertActive;

    /// <summary>Current attempt index of the backoff schedule, as reported by <see cref="Backoff"/>.</summary>
    public int BackoffAttempt => _backoff.AttemptIndex;

    /// <summary>
    /// Returns the current live client. If none, tries to launch — applying the
    /// backoff schedule between attempts and stopping once the breaker opens.
    /// </summary>
    /// <param name="ct">Cancels the launch attempt and any backoff delay.</param>
    /// <returns>The cached client while the Host process is alive, otherwise a freshly launched one.</returns>
    /// <exception cref="ObjectDisposedException">The supervisor has been disposed.</exception>
    /// <exception cref="InvalidOperationException">The circuit breaker is open.</exception>
    public async Task<IFocasClient> GetOrLaunchAsync(CancellationToken ct)
    {
        ThrowIfDisposed();

        if (_current is not null)
        {
            if (_launcher.IsProcessAlive)
            {
                return _current;
            }

            // The process died without a NotifyHostDeadAsync call. Release the stale client
            // before relaunching; otherwise the launch below overwrites the only reference
            // to it and it is never disposed.
            var stale = _current;
            _current = null;
            try { stale.Dispose(); }
            catch { /* best effort — the process behind this client is already gone */ }
        }

        return await LaunchWithBackoffAsync(ct).ConfigureAwait(false);
    }

    /// <summary>
    /// Called by the heartbeat task each time a miss threshold is crossed.
    /// Treated as a crash: fan out Bad status, terminate the Host, and drop the current
    /// client. The respawn is deferred to the next <see cref="GetOrLaunchAsync"/> call.
    /// </summary>
    /// <param name="reason">Short reason string forwarded to <see cref="OnUnavailable"/> subscribers.</param>
    /// <param name="ct">Passed to the launcher's terminate call.</param>
    /// <exception cref="ObjectDisposedException">The supervisor has been disposed.</exception>
    public async Task NotifyHostDeadAsync(string reason, CancellationToken ct)
    {
        ThrowIfDisposed();

        // Notify subscribers first so the fan-out is not delayed by process teardown.
        OnUnavailable?.Invoke(reason);
        ObservedCrashes++;

        try { await _launcher.TerminateAsync(ct).ConfigureAwait(false); }
        catch { /* best effort */ }

        _current?.Dispose();
        _current = null;

        if (!_breaker.TryRecordCrash(_clock(), out var cooldown))
        {
            // Breaker refused the crash: tell subscribers whether an operator reset is
            // needed or a cooldown is running.
            OnUnavailable?.Invoke(cooldown == TimeSpan.MaxValue
                ? "circuit-breaker-open-manual-reset-required"
                : $"circuit-breaker-open-cooldown-{cooldown:g}");
            return;
        }

        // Successful crash recording — do not respawn synchronously; GetOrLaunchAsync will
        // pick up the attempt on the next call. Keeps the fan-out fast.
    }

    /// <summary>Operator action — clear the sticky alert + reset the breaker and the backoff ladder.</summary>
    public void AcknowledgeAndReset()
    {
        _breaker.ManualReset();
        _backoff.RecordStableRun();
    }

    /// <summary>
    /// Launch loop: attempts a launch, and on failure reports it, records the crash with
    /// the breaker, waits for the next backoff delay and retries. Cancellation is not
    /// treated as a launch failure and propagates to the caller.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The breaker is latched awaiting manual reset, or it rejected a crash recorded by this loop.
    /// </exception>
    private async Task<IFocasClient> LaunchWithBackoffAsync(CancellationToken ct)
    {
        while (true)
        {
            if (_breaker.StickyAlertActive)
            {
                // NOTE(review): the breaker is probed through TryRecordCrash although no
                // crash happened here. Whether that adds a crash sample depends on
                // CircuitBreaker — confirm this is intended.
                if (!_breaker.TryRecordCrash(_clock(), out var cooldown) && cooldown == TimeSpan.MaxValue)
                {
                    throw new InvalidOperationException(
                        "FOCAS Host circuit breaker is open and awaiting manual reset. " +
                        "See Admin /hosts; call AcknowledgeAndReset after investigating the Host log.");
                }
            }

            try
            {
                _current = await _launcher.LaunchAsync(ct).ConfigureAwait(false);
                _currentStartedUtc = _clock();

                // If the launch sequence itself takes long enough to count as a stable run,
                // reset the backoff ladder immediately.
                // NOTE(review): the start timestamp is captured after LaunchAsync returns, so
                // this difference only spans the two clock reads above and below — confirm
                // whether the launch duration was meant to be measured.
                if (_clock() - _currentStartedUtc >= _backoff.StableRunThreshold)
                {
                    _backoff.RecordStableRun();
                }

                return _current;
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                OnUnavailable?.Invoke($"launch-failed: {ex.Message}");
                ObservedCrashes++;

                if (!_breaker.TryRecordCrash(_clock(), out var cooldown))
                {
                    var hint = cooldown == TimeSpan.MaxValue
                        ? "manual reset required"
                        : $"cooldown {cooldown:g}";
                    throw new InvalidOperationException(
                        $"FOCAS Host circuit breaker opened after {ObservedCrashes} crashes — {hint}.", ex);
                }

                // Breaker still closed: wait out the next backoff step, then retry.
                var delay = _backoff.Next();
                await Task.Delay(delay, ct).ConfigureAwait(false);
            }
        }
    }

    /// <summary>Called from the heartbeat loop after a successful ack run — relaxes the backoff ladder.</summary>
    /// <remarks>
    /// Does nothing when there is no current client, or when the current client has been
    /// up for less than the backoff's stable-run threshold.
    /// </remarks>
    public void NotifyStableRun()
    {
        if (_current is null)
        {
            return;
        }

        if (_clock() - _currentStartedUtc >= _backoff.StableRunThreshold)
        {
            _backoff.RecordStableRun();
        }
    }

    /// <summary>
    /// Terminates the Host (best effort) and disposes the current client. Safe to call
    /// more than once; only the first call has any effect.
    /// </summary>
    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }

        _disposed = true;

        // IDisposable is synchronous, so the async terminate call is awaited by blocking.
        try { _launcher.TerminateAsync(CancellationToken.None).GetAwaiter().GetResult(); }
        catch { /* best effort */ }

        _current?.Dispose();
        _current = null;
    }

    /// <summary>Throws <see cref="ObjectDisposedException"/> once <see cref="Dispose"/> has run.</summary>
    private void ThrowIfDisposed()
    {
        if (_disposed)
        {
            throw new ObjectDisposedException(nameof(FocasHostSupervisor));
        }
    }
}
|