PR 4.5 — ReconnectSupervisor

State machine that drives GalaxyDriver's recovery from gw transport
failure. Healthy → TransportLost → Reopening → Replaying → Healthy. The driver
reports failure signals; the supervisor runs reopen + replay with capped
exponential backoff (default 500ms → 30s) until both succeed.
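
A minimal usage sketch; the no-op callbacks below are stand-ins for the real
reopen/replay that the PR 4.W wiring supplies:

```csharp
// Sketch only: the lambdas stand in for the GalaxyMxSession / SubscriptionRegistry
// callbacks that land with the PR 4.W wiring.
using var supervisor = new ReconnectSupervisor(
    reopen: _ => Task.CompletedTask,   // would re-open + re-Register the gw session
    replay: _ => Task.CompletedTask,   // would re-establish active subscriptions
    options: new ReconnectOptions());  // defaults: 500ms initial backoff, 30s cap

supervisor.StateChanged += (_, t) =>
    Console.WriteLine($"{t.Previous} -> {t.Next} ({t.Cause})");

supervisor.ReportTransportFailure(new IOException("gw stream dropped"));
await supervisor.WaitForHealthyAsync(CancellationToken.None); // back to Healthy
```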

Files:
- Runtime/ReconnectSupervisor.cs — state machine with snapshot, change
  event, last-error tracking, and a one-attempt-at-a-time recovery loop.
  Idempotent ReportTransportFailure: repeated failure reports during an
  in-flight recovery do not spawn parallel loops. Reopen + replay are
  caller-supplied callbacks (the driver injects them in the wire-up PR);
  reopen re-Registers the gw session, replay re-establishes every active
  subscription via gw's ReplaySubscriptionsCommand (mxaccessgw issue gw-3)
  or the SubscribeBulk fallback (sketched after this list). Dispose cancels
  the loop cleanly.
- Public StateTransition record + IsDegraded predicate the driver maps
  to DriverState.Degraded for health snapshots.
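
A sketch of the SubscribeBulk fallback's shape, under assumed gw-client and
registry surfaces (SubscribeBulkAsync, the tracked-tag collection, and the
batch size are all illustrative names, not the real gw API):

```csharp
// Assumed shapes: IGwClient and the tracked-tag collection are placeholders for
// the real gw client and SubscriptionRegistry, neither of which is in this PR.
public interface IGwClient
{
    Task SubscribeBulkAsync(IReadOnlyList<string> tags, CancellationToken ct);
}

public static class ReplayFallback
{
    // Builds a replay callback that walks the tracked tags and re-issues
    // SubscribeBulk in batches.
    public static Func<CancellationToken, Task> Build(
        IGwClient gw, IReadOnlyCollection<string> trackedTags) =>
        async ct =>
        {
            foreach (var batch in trackedTags.Chunk(500)) // batch size is a guess
                await gw.SubscribeBulkAsync(batch, ct);
        };
}
```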

Wiring (GalaxyDriver subscribes the supervisor to its EventPump's
transport-failure signal, exposes IsDegraded through GetHealth(), routes
reopen/replay callbacks through GalaxyMxSession + SubscriptionRegistry)
lands in PR 4.W to avoid conflict with the parallel host-probe track
(PR 4.7) and align the wire-up with the rest of Phase 4's plumbing.
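
A hedged sketch of that health mapping; the DriverHealth/DriverState shapes
below are assumptions, since GetHealth() itself lands in PR 4.W:

```csharp
// Assumed shapes: the real DriverHealth/DriverState come from the driver core,
// not this PR, and GalaxyDriver's field layout is illustrative.
public enum DriverState { Running, Degraded }
public sealed record DriverHealth(DriverState State, string? LastError, DateTime? SinceUtc);

public sealed partial class GalaxyDriver
{
    private readonly ReconnectSupervisor _reconnect = null!; // injected in PR 4.W

    public DriverHealth GetHealth() =>
        new(_reconnect.IsDegraded ? DriverState.Degraded : DriverState.Running,
            _reconnect.LastError,
            _reconnect.LastTransitionUtc);
}
```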

9 supervisor tests (full state-machine traversal, retry-until-success on
both reopen and replay failures, idempotent failure reports, last-error
propagation, Dispose mid-recovery, post-dispose throws, fast-path Healthy
WaitForHealthy).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Runtime/ReconnectSupervisor.cs

@@ -0,0 +1,268 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Coordinates GalaxyDriver's recovery from gateway transport failure. Drives a
/// state machine — <c>Healthy → TransportLost → Reopening → Replaying → Healthy</c>
/// — and exposes the current state through a snapshot + change event so the
/// driver's <c>DriverHealth</c> reflects <c>Degraded</c> while we're not in
/// <c>Healthy</c>.
/// </summary>
/// <remarks>
/// <para>
/// The supervisor doesn't own the session, the subscription registry, or the
/// event pump. It receives transport-failure signals from the rest of the
/// driver (EventPump throws, a gw RPC raises, the heartbeat times out), runs
/// a one-attempt-at-a-time recovery loop, and lets the rest of the driver
/// continue serving cached state during recovery.
/// </para>
/// <para>
/// <b>Reopen</b>: caller-supplied callback that re-opens the gw session +
/// re-Registers the MXAccess client. Throws on failure.
/// </para>
/// <para>
/// <b>Replay</b>: caller-supplied callback that re-establishes every active
/// subscription. Production wraps gw's <c>ReplaySubscriptionsCommand</c>
/// (mxaccessgw issue gw-3); when that's not available, the callback falls
/// back to walking the SubscriptionRegistry and re-issuing SubscribeBulk for
/// every tracked tag.
/// </para>
/// <para>
/// Backoff is capped exponential — first retry after
/// <see cref="ReconnectOptions.InitialBackoff"/>, doubled per failed attempt,
/// capped at <see cref="ReconnectOptions.MaxBackoff"/>. Persistent reopen
/// failures hold the supervisor in <c>Reopening</c>; persistent replay failures
/// cycle it between <c>Reopening</c> and <c>Replaying</c>. Either way it never
/// gives up on its own — operators / Phase 6.4 soak handle that policy.
/// </para>
/// </remarks>
public sealed class ReconnectSupervisor : IDisposable
{
/// <summary>Recovery state machine.</summary>
public enum State
{
Healthy,
TransportLost,
Reopening,
Replaying,
}
private readonly Func<CancellationToken, Task> _reopen;
private readonly Func<CancellationToken, Task> _replay;
private readonly ReconnectOptions _options;
private readonly ILogger _logger;
private readonly Func<int, TimeSpan, TimeSpan, TimeSpan>? _backoffDelay;
private readonly Lock _stateLock = new();
private State _state = State.Healthy;
private string? _lastError;
private DateTime? _lastTransitionUtc;
private Task? _recoveryLoop;
private CancellationTokenSource? _loopCts;
private bool _disposed;
/// <summary>Fires after every state transition.</summary>
public event EventHandler<StateTransition>? StateChanged;
public ReconnectSupervisor(
Func<CancellationToken, Task> reopen,
Func<CancellationToken, Task> replay,
ReconnectOptions? options = null,
ILogger? logger = null,
Func<int, TimeSpan, TimeSpan, TimeSpan>? backoffDelay = null)
{
_reopen = reopen ?? throw new ArgumentNullException(nameof(reopen));
_replay = replay ?? throw new ArgumentNullException(nameof(replay));
_options = options ?? new ReconnectOptions();
_logger = logger ?? NullLogger.Instance;
_backoffDelay = backoffDelay;
}
/// <summary>Current state. Healthy = fully recovered + subscriptions live.</summary>
public State CurrentState
{
get { lock (_stateLock) return _state; }
}
/// <summary>True when CurrentState != Healthy. Drivers map this to DriverState.Degraded.</summary>
public bool IsDegraded
{
get { lock (_stateLock) return _state != State.Healthy; }
}
public string? LastError
{
get { lock (_stateLock) return _lastError; }
}
public DateTime? LastTransitionUtc
{
get { lock (_stateLock) return _lastTransitionUtc; }
}
/// <summary>
/// Notify the supervisor that a gw transport failure has been observed. Idempotent —
/// repeated calls during an in-flight recovery do not start a parallel loop. The
/// first call spawns a background task that drives reopen → replay until it
/// succeeds or <see cref="Dispose"/> cancels it.
/// </summary>
public void ReportTransportFailure(Exception cause)
{
ArgumentNullException.ThrowIfNull(cause);
ObjectDisposedException.ThrowIf(_disposed, this);
lock (_stateLock)
{
_lastError = cause.Message;
if (_state != State.Healthy)
{
// Already recovering — nothing else to do.
_logger.LogDebug("Transport failure reported during {State}: {Message}", _state, cause.Message);
return;
}
TransitionLocked(State.TransportLost, cause.Message);
_loopCts?.Dispose(); // release the CTS left behind by a previously completed recovery
_loopCts = new CancellationTokenSource();
var token = _loopCts.Token; // capture now; Dispose can null the field before the task starts
_recoveryLoop = Task.Run(() => RecoveryLoopAsync(token));
}
}
/// <summary>
/// Wait until the current recovery cycle reaches Healthy or the supplied token is
/// cancelled; cancellation returns normally rather than throwing. Returns
/// immediately when already Healthy. Useful for tests and for orchestration that
/// wants to gate calls on recovery completing.
/// </summary>
public async Task WaitForHealthyAsync(CancellationToken cancellationToken)
{
while (!cancellationToken.IsCancellationRequested && IsDegraded)
{
try { await Task.Delay(50, cancellationToken).ConfigureAwait(false); }
catch (OperationCanceledException) { return; } // stop waiting on cancellation
}
}
private async Task RecoveryLoopAsync(CancellationToken ct)
{
var attempt = 0;
while (!ct.IsCancellationRequested)
{
attempt++;
if (attempt > 1)
{
var delay = ComputeBackoff(attempt);
_logger.LogInformation(
"Galaxy reconnect attempt {Attempt} — waiting {Delay} before retry", attempt, delay);
try { await Task.Delay(delay, ct).ConfigureAwait(false); }
catch (OperationCanceledException) { return; }
}
// === Reopening phase ===
lock (_stateLock) TransitionLocked(State.Reopening, _lastError);
try
{
await _reopen(ct).ConfigureAwait(false);
}
catch (OperationCanceledException) when (ct.IsCancellationRequested) { return; }
catch (Exception ex)
{
_logger.LogWarning(ex, "Galaxy reopen failed (attempt {Attempt})", attempt);
lock (_stateLock) { _lastError = ex.Message; }
continue; // back to backoff + retry
}
// === Replaying phase ===
lock (_stateLock) TransitionLocked(State.Replaying, _lastError);
try
{
await _replay(ct).ConfigureAwait(false);
}
catch (OperationCanceledException) when (ct.IsCancellationRequested) { return; }
catch (Exception ex)
{
_logger.LogWarning(ex, "Galaxy replay failed (attempt {Attempt})", attempt);
lock (_stateLock) { _lastError = ex.Message; }
continue; // back to backoff + retry
}
// === Done ===
lock (_stateLock)
{
_lastError = null;
TransitionLocked(State.Healthy, null);
}
_logger.LogInformation("Galaxy reconnect succeeded after {Attempt} attempt(s)", attempt);
return;
}
}
private TimeSpan ComputeBackoff(int attempt)
{
if (_backoffDelay is not null)
return _backoffDelay(attempt, _options.InitialBackoff, _options.MaxBackoff);
// Standard capped exponential — InitialBackoff * 2^(attempt-2), capped at MaxBackoff.
// Attempt 2 → InitialBackoff, attempt 3 → 2x, attempt 4 → 4x, etc.
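// With the defaults that's 500ms, 1s, 2s, 4s, 8s, 16s, then 30s (the cap) thereafter.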
var multiplier = Math.Min(1L << Math.Max(0, attempt - 2), int.MaxValue);
var ticks = _options.InitialBackoff.Ticks * multiplier;
if (ticks <= 0 || ticks > _options.MaxBackoff.Ticks) ticks = _options.MaxBackoff.Ticks;
return TimeSpan.FromTicks(ticks);
}
private void TransitionLocked(State next, string? cause)
{
if (next == _state) return;
var previous = _state;
_state = next;
_lastTransitionUtc = DateTime.UtcNow;
var transition = new StateTransition(previous, next, cause, _lastTransitionUtc.Value);
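// Handlers run while _stateLock is held; Lock is reentrant, so a handler may
// safely read supervisor properties.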
try { StateChanged?.Invoke(this, transition); }
catch (Exception ex)
{
_logger.LogWarning(ex,
"Galaxy reconnect StateChanged handler threw — continuing.");
}
}
public void Dispose()
{
if (_disposed) return;
_disposed = true;
CancellationTokenSource? cts;
Task? loop;
lock (_stateLock) { cts = _loopCts; loop = _recoveryLoop; _loopCts = null; _recoveryLoop = null; }
cts?.Cancel();
if (loop is not null)
{
try { loop.GetAwaiter().GetResult(); } catch { /* shutdown */ }
}
cts?.Dispose();
}
}
/// <summary>
/// One state transition observed by the supervisor.
/// </summary>
public sealed record StateTransition(
ReconnectSupervisor.State Previous,
ReconnectSupervisor.State Next,
string? Cause,
DateTime AtUtc);
/// <summary>
/// Knobs for the supervisor's backoff. <see cref="ReconnectOptions"/> on the driver
/// options record (PR 4.0) maps onto this — they're separate types so the supervisor
/// can be exercised in tests without the full driver options surface.
/// </summary>
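/// <example>
/// <code>
/// // Assumed driver-side names; the PR 4.0 options record isn't shown here.
/// var options = new ReconnectOptions(
///     driverOptions.Reconnect.InitialBackoff,
///     driverOptions.Reconnect.MaxBackoff);
/// </code>
/// </example>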
public sealed record ReconnectOptions(
TimeSpan? InitialBackoffOverride = null,
TimeSpan? MaxBackoffOverride = null)
{
public TimeSpan InitialBackoff => InitialBackoffOverride ?? TimeSpan.FromMilliseconds(500);
public TimeSpan MaxBackoff => MaxBackoffOverride ?? TimeSpan.FromSeconds(30);
}

ReconnectSupervisorTests.cs

@@ -0,0 +1,173 @@
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests.Runtime;
/// <summary>
/// Tests for <see cref="ReconnectSupervisor"/>'s state machine + backoff. Each
/// scenario drives the supervisor with controllable reopen/replay callbacks and
/// observes the resulting state transitions.
/// </summary>
public sealed class ReconnectSupervisorTests
{
private const int WaitMs = 2_000;
private static ReconnectOptions FastBackoff() =>
new(InitialBackoffOverride: TimeSpan.FromMilliseconds(5),
MaxBackoffOverride: TimeSpan.FromMilliseconds(20));
[Fact]
public void InitialState_IsHealthy()
{
using var sup = new ReconnectSupervisor(_ => Task.CompletedTask, _ => Task.CompletedTask, FastBackoff());
sup.CurrentState.ShouldBe(ReconnectSupervisor.State.Healthy);
sup.IsDegraded.ShouldBeFalse();
sup.LastError.ShouldBeNull();
}
[Fact]
public async Task ReportTransportFailure_DrivesThroughReopenReplay_BackToHealthy()
{
var transitions = new List<StateTransition>();
var lockObj = new object();
using var sup = new ReconnectSupervisor(
reopen: _ => Task.CompletedTask,
replay: _ => Task.CompletedTask,
options: FastBackoff());
sup.StateChanged += (_, t) => { lock (lockObj) transitions.Add(t); };
sup.ReportTransportFailure(new IOException("transport drop"));
await sup.WaitForHealthyAsync(new CancellationTokenSource(WaitMs).Token);
// Expected sequence: Healthy → TransportLost → Reopening → Replaying → Healthy.
IReadOnlyList<StateTransition> snapshot;
lock (lockObj) snapshot = [.. transitions];
var states = snapshot.Select(t => t.Next).ToArray();
states.ShouldContain(ReconnectSupervisor.State.TransportLost);
states.ShouldContain(ReconnectSupervisor.State.Reopening);
states.ShouldContain(ReconnectSupervisor.State.Replaying);
states[^1].ShouldBe(ReconnectSupervisor.State.Healthy);
sup.IsDegraded.ShouldBeFalse();
}
[Fact]
public async Task ReopenFailure_RetriesUntilSuccess_StaysInReopeningBetweenAttempts()
{
var attempts = 0;
using var sup = new ReconnectSupervisor(
reopen: _ => { attempts++; return attempts < 3 ? Task.FromException(new IOException("nope")) : Task.CompletedTask; },
replay: _ => Task.CompletedTask,
options: FastBackoff());
sup.ReportTransportFailure(new IOException("kick off"));
await sup.WaitForHealthyAsync(new CancellationTokenSource(WaitMs).Token);
attempts.ShouldBe(3);
sup.CurrentState.ShouldBe(ReconnectSupervisor.State.Healthy);
sup.LastError.ShouldBeNull(); // cleared on Healthy transition
}
[Fact]
public async Task ReplayFailure_RetriesEntireCycle()
{
var reopens = 0;
var replays = 0;
using var sup = new ReconnectSupervisor(
reopen: _ => { reopens++; return Task.CompletedTask; },
replay: _ => { replays++; return replays < 2 ? Task.FromException(new IOException("replay nope")) : Task.CompletedTask; },
options: FastBackoff());
sup.ReportTransportFailure(new IOException("kick off"));
await sup.WaitForHealthyAsync(new CancellationTokenSource(WaitMs).Token);
// First cycle: reopen succeeds, replay fails. Second cycle: both succeed.
reopens.ShouldBe(2);
replays.ShouldBe(2);
sup.CurrentState.ShouldBe(ReconnectSupervisor.State.Healthy);
}
[Fact]
public async Task RepeatedFailureReports_DuringRecovery_DoNotSpawnParallelLoops()
{
var attempts = 0;
using var sup = new ReconnectSupervisor(
reopen: async ct =>
{
attempts++;
await Task.Delay(50, ct).ConfigureAwait(false);
},
replay: _ => Task.CompletedTask,
options: FastBackoff());
sup.ReportTransportFailure(new IOException("first"));
// Fire several more reports while reopen is in flight.
for (var i = 0; i < 5; i++)
{
sup.ReportTransportFailure(new IOException($"rapid-{i}"));
}
await sup.WaitForHealthyAsync(new CancellationTokenSource(WaitMs).Token);
// One Reopen call regardless of how many failures arrived during recovery.
attempts.ShouldBe(1);
}
[Fact]
public async Task LastError_ReflectsMostRecentFailureCause()
{
using var sup = new ReconnectSupervisor(
reopen: _ => Task.FromException(new IOException("reopen broke")),
replay: _ => Task.CompletedTask,
options: new ReconnectOptions(
InitialBackoffOverride: TimeSpan.FromMilliseconds(5),
MaxBackoffOverride: TimeSpan.FromMilliseconds(10)));
sup.ReportTransportFailure(new IOException("initial"));
// Allow the loop to attempt at least twice.
await Task.Delay(100);
sup.LastError.ShouldNotBeNull();
sup.LastError.ShouldContain("reopen broke"); // updates from the loop's failed reopen attempts
}
[Fact]
public async Task Dispose_CancelsRunningRecoveryLoop_Cleanly()
{
var cancelled = false;
var sup = new ReconnectSupervisor(
reopen: async ct =>
{
try { await Task.Delay(10_000, ct).ConfigureAwait(false); }
catch (OperationCanceledException) { cancelled = true; throw; }
},
replay: _ => Task.CompletedTask,
options: FastBackoff());
sup.ReportTransportFailure(new IOException("kick off"));
await Task.Delay(50); // let the loop start the long reopen
Should.NotThrow(() => sup.Dispose());
cancelled.ShouldBeTrue();
}
[Fact]
public void ReportTransportFailure_AfterDispose_Throws()
{
var sup = new ReconnectSupervisor(_ => Task.CompletedTask, _ => Task.CompletedTask, FastBackoff());
sup.Dispose();
Should.Throw<ObjectDisposedException>(() => sup.ReportTransportFailure(new IOException("x")));
}
[Fact]
public async Task WaitForHealthy_ReturnsImmediately_WhenAlreadyHealthy()
{
using var sup = new ReconnectSupervisor(_ => Task.CompletedTask, _ => Task.CompletedTask, FastBackoff());
// No failure reported — should be Healthy from the start.
var deadline = DateTime.UtcNow.AddMilliseconds(50);
await sup.WaitForHealthyAsync(new CancellationTokenSource(50).Token);
DateTime.UtcNow.ShouldBeLessThan(deadline.AddMilliseconds(100)); // returned promptly
}
}