PR 4.5 — ReconnectSupervisor
State machine that drives GalaxyDriver's recovery from gw transport failure. Healthy → TransportLost → Reopening → Replaying → Healthy. Drivers report failure signals; the supervisor runs reopen + replay with capped exponential backoff (default 500ms → 30s) until both succeed. Files: - Runtime/ReconnectSupervisor.cs — state machine with snapshot, change event, last-error tracking, and a one-attempt-at-a-time recovery loop. Idempotent ReportTransportFailure: repeated failure reports during an in-flight recovery do not spawn parallel loops. Reopen + replay are caller-supplied callbacks (the driver injects them in the wire-up PR); reopen re-Registers the gw session, replay re-establishes every active subscription via gw's ReplaySubscriptionsCommand (mxaccessgw issue gw-3) or the SubscribeBulk fallback. Dispose cancels the loop cleanly. - Public StateTransition record + IsDegraded predicate the driver maps to DriverState.Degraded for health snapshots. Wiring (GalaxyDriver subscribes the supervisor to its EventPump's transport-failure signal, exposes IsDegraded through GetHealth(), routes reopen/replay callbacks through GalaxyMxSession + SubscriptionRegistry) lands in PR 4.W to avoid conflict with the parallel host-probe track (PR 4.7) and align the wire-up with the rest of Phase 4's plumbing. 9 supervisor tests (full state-machine traversal, retry-until-success on both reopen and replay failures, idempotent failure reports, last-error propagation, Dispose mid-recovery, post-dispose throws, fast-path Healthy WaitForHealthy). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,268 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
|
||||
|
||||
/// <summary>
|
||||
/// Coordinates GalaxyDriver's recovery from gateway transport failure. Drives a
|
||||
/// state machine — <c>Healthy → TransportLost → Reopening → Replaying → Healthy</c>
|
||||
/// — and exposes the current state through a snapshot + change event so the
|
||||
/// driver's <c>DriverHealth</c> reflects <c>Degraded</c> while we're not in
|
||||
/// <c>Healthy</c>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The supervisor doesn't own the session, the subscription registry, or the
|
||||
/// event pump. It receives transport-failure signals from the rest of the
|
||||
/// driver (EventPump throws, a gw RPC raises, the heartbeat times out), runs
|
||||
/// a one-attempt-at-a-time recovery loop, and lets the rest of the driver
|
||||
/// continue serving cached state during recovery.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Reopen</b>: caller-supplied callback that re-opens the gw session +
|
||||
/// re-Registers the MXAccess client. Throws on failure.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Replay</b>: caller-supplied callback that re-establishes every active
|
||||
/// subscription. Production wraps gw's <c>ReplaySubscriptionsCommand</c>
|
||||
/// (mxaccessgw issue #0.3); when that's not available, the callback falls
|
||||
/// back to walking the SubscriptionRegistry and re-issuing SubscribeBulk for
|
||||
/// every tracked tag.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Backoff is capped exponential — first retry after
|
||||
/// <see cref="ReconnectOptions.InitialBackoff"/>, doubled per failed attempt,
|
||||
/// capped at <see cref="ReconnectOptions.MaxBackoff"/>. Persistent failures
|
||||
/// hold the supervisor in <c>Reopening</c> indefinitely; the supervisor never
|
||||
/// gives up on its own — operators / Phase 6.4 soak handle that policy.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class ReconnectSupervisor : IDisposable
|
||||
{
|
||||
/// <summary>Recovery state machine.</summary>
|
||||
public enum State
|
||||
{
|
||||
Healthy,
|
||||
TransportLost,
|
||||
Reopening,
|
||||
Replaying,
|
||||
}
|
||||
|
||||
private readonly Func<CancellationToken, Task> _reopen;
|
||||
private readonly Func<CancellationToken, Task> _replay;
|
||||
private readonly ReconnectOptions _options;
|
||||
private readonly ILogger _logger;
|
||||
private readonly Func<int, TimeSpan, TimeSpan, TimeSpan>? _backoffDelay;
|
||||
|
||||
private readonly Lock _stateLock = new();
|
||||
private State _state = State.Healthy;
|
||||
private string? _lastError;
|
||||
private DateTime? _lastTransitionUtc;
|
||||
|
||||
private Task? _recoveryLoop;
|
||||
private CancellationTokenSource? _loopCts;
|
||||
private bool _disposed;
|
||||
|
||||
/// <summary>Fires after every state transition.</summary>
|
||||
public event EventHandler<StateTransition>? StateChanged;
|
||||
|
||||
public ReconnectSupervisor(
|
||||
Func<CancellationToken, Task> reopen,
|
||||
Func<CancellationToken, Task> replay,
|
||||
ReconnectOptions? options = null,
|
||||
ILogger? logger = null,
|
||||
Func<int, TimeSpan, TimeSpan, TimeSpan>? backoffDelay = null)
|
||||
{
|
||||
_reopen = reopen ?? throw new ArgumentNullException(nameof(reopen));
|
||||
_replay = replay ?? throw new ArgumentNullException(nameof(replay));
|
||||
_options = options ?? new ReconnectOptions();
|
||||
_logger = logger ?? NullLogger.Instance;
|
||||
_backoffDelay = backoffDelay;
|
||||
}
|
||||
|
||||
/// <summary>Current state. Healthy = fully recovered + subscriptions live.</summary>
|
||||
public State CurrentState
|
||||
{
|
||||
get { lock (_stateLock) return _state; }
|
||||
}
|
||||
|
||||
/// <summary>True when CurrentState != Healthy. Drivers map this to DriverState.Degraded.</summary>
|
||||
public bool IsDegraded
|
||||
{
|
||||
get { lock (_stateLock) return _state != State.Healthy; }
|
||||
}
|
||||
|
||||
public string? LastError
|
||||
{
|
||||
get { lock (_stateLock) return _lastError; }
|
||||
}
|
||||
|
||||
public DateTime? LastTransitionUtc
|
||||
{
|
||||
get { lock (_stateLock) return _lastTransitionUtc; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Notify the supervisor that a gw transport failure has been observed. Idempotent —
|
||||
/// repeated calls during an in-flight recovery do not start a parallel loop. The
|
||||
/// first call spawns a background task that drives reopen → replay until it
|
||||
/// succeeds or <see cref="Dispose"/> cancels it.
|
||||
/// </summary>
|
||||
public void ReportTransportFailure(Exception cause)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(cause);
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
|
||||
lock (_stateLock)
|
||||
{
|
||||
_lastError = cause.Message;
|
||||
if (_state != State.Healthy)
|
||||
{
|
||||
// Already recovering — nothing else to do.
|
||||
_logger.LogDebug("Transport failure reported during {State}: {Message}", _state, cause.Message);
|
||||
return;
|
||||
}
|
||||
|
||||
TransitionLocked(State.TransportLost, cause.Message);
|
||||
|
||||
_loopCts = new CancellationTokenSource();
|
||||
_recoveryLoop = Task.Run(() => RecoveryLoopAsync(_loopCts.Token));
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Wait until the current recovery cycle reaches Healthy or the supplied token
|
||||
/// is cancelled. Returns immediately when already Healthy. Useful for tests
|
||||
/// and for orchestration that wants to gate calls on recovery completing.
|
||||
/// </summary>
|
||||
public async Task WaitForHealthyAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
while (!cancellationToken.IsCancellationRequested && IsDegraded)
|
||||
{
|
||||
await Task.Delay(50, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task RecoveryLoopAsync(CancellationToken ct)
|
||||
{
|
||||
var attempt = 0;
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
attempt++;
|
||||
if (attempt > 1)
|
||||
{
|
||||
var delay = ComputeBackoff(attempt);
|
||||
_logger.LogInformation(
|
||||
"Galaxy reconnect attempt {Attempt} — waiting {Delay} before retry", attempt, delay);
|
||||
try { await Task.Delay(delay, ct).ConfigureAwait(false); }
|
||||
catch (OperationCanceledException) { return; }
|
||||
}
|
||||
|
||||
// === Reopening phase ===
|
||||
lock (_stateLock) TransitionLocked(State.Reopening, _lastError);
|
||||
|
||||
try
|
||||
{
|
||||
await _reopen(ct).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException) when (ct.IsCancellationRequested) { return; }
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Galaxy reopen failed (attempt {Attempt})", attempt);
|
||||
lock (_stateLock) { _lastError = ex.Message; }
|
||||
continue; // back to backoff + retry
|
||||
}
|
||||
|
||||
// === Replaying phase ===
|
||||
lock (_stateLock) TransitionLocked(State.Replaying, _lastError);
|
||||
|
||||
try
|
||||
{
|
||||
await _replay(ct).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException) when (ct.IsCancellationRequested) { return; }
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Galaxy replay failed (attempt {Attempt})", attempt);
|
||||
lock (_stateLock) { _lastError = ex.Message; }
|
||||
continue; // back to backoff + retry
|
||||
}
|
||||
|
||||
// === Done ===
|
||||
lock (_stateLock)
|
||||
{
|
||||
_lastError = null;
|
||||
TransitionLocked(State.Healthy, null);
|
||||
}
|
||||
_logger.LogInformation("Galaxy reconnect succeeded after {Attempt} attempt(s)", attempt);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
private TimeSpan ComputeBackoff(int attempt)
|
||||
{
|
||||
if (_backoffDelay is not null)
|
||||
return _backoffDelay(attempt, _options.InitialBackoff, _options.MaxBackoff);
|
||||
|
||||
// Standard capped exponential — InitialBackoff * 2^(attempt-2), capped at MaxBackoff.
|
||||
// Attempt 2 → InitialBackoff, attempt 3 → 2x, attempt 4 → 4x, etc.
|
||||
var multiplier = Math.Min(1L << Math.Max(0, attempt - 2), int.MaxValue);
|
||||
var ticks = _options.InitialBackoff.Ticks * multiplier;
|
||||
if (ticks <= 0 || ticks > _options.MaxBackoff.Ticks) ticks = _options.MaxBackoff.Ticks;
|
||||
return TimeSpan.FromTicks(ticks);
|
||||
}
|
||||
|
||||
private void TransitionLocked(State next, string? cause)
|
||||
{
|
||||
if (next == _state) return;
|
||||
var previous = _state;
|
||||
_state = next;
|
||||
_lastTransitionUtc = DateTime.UtcNow;
|
||||
var transition = new StateTransition(previous, next, cause, _lastTransitionUtc.Value);
|
||||
try { StateChanged?.Invoke(this, transition); }
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex,
|
||||
"Galaxy reconnect StateChanged handler threw — continuing.");
|
||||
}
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
if (_disposed) return;
|
||||
_disposed = true;
|
||||
|
||||
CancellationTokenSource? cts;
|
||||
Task? loop;
|
||||
lock (_stateLock) { cts = _loopCts; loop = _recoveryLoop; _loopCts = null; _recoveryLoop = null; }
|
||||
|
||||
cts?.Cancel();
|
||||
if (loop is not null)
|
||||
{
|
||||
try { loop.GetAwaiter().GetResult(); } catch { /* shutdown */ }
|
||||
}
|
||||
cts?.Dispose();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// One state transition observed by the supervisor.
|
||||
/// </summary>
|
||||
public sealed record StateTransition(
|
||||
ReconnectSupervisor.State Previous,
|
||||
ReconnectSupervisor.State Next,
|
||||
string? Cause,
|
||||
DateTime AtUtc);
|
||||
|
||||
/// <summary>
|
||||
/// Knobs for the supervisor's backoff. <see cref="ReconnectOptions"/> on the driver
|
||||
/// options record (PR 4.0) maps onto this — they're separate types so the supervisor
|
||||
/// can be exercised in tests without the full driver options surface.
|
||||
/// </summary>
|
||||
public sealed record ReconnectOptions(
|
||||
TimeSpan? InitialBackoffOverride = null,
|
||||
TimeSpan? MaxBackoffOverride = null)
|
||||
{
|
||||
public TimeSpan InitialBackoff => InitialBackoffOverride ?? TimeSpan.FromMilliseconds(500);
|
||||
public TimeSpan MaxBackoff => MaxBackoffOverride ?? TimeSpan.FromSeconds(30);
|
||||
}
|
||||
Reference in New Issue
Block a user