mbproxy: initial commit through Phase 9 (TxId multiplexing)
Adds the mbproxy service end-to-end. Phases 00-08 implement the production-ready single-listener / 1:1-backend transparent Modbus TCP proxy with bidirectional BCD rewriting for the ~54-PLC DL205/DL260 fleet. Phase 9 replaces the connection layer with a single backend socket per PLC plus MBAP TxId rewriting, lifting the H2-ECOM100's 4-concurrent-client cap as an operational ceiling.

Phase 9 additions of note:
- PlcMultiplexer + UpstreamPipe + TxIdAllocator + CorrelationMap
- InFlightRequest with IReadOnlyList<InterestedParty> (load-bearing for Phase 10 read coalescing — do not collapse to a single field)
- Per-request watchdog: surfaces Modbus exception 0x0B to upstream on BackendRequestTimeoutMs, defending against lost responses, dead-PLC paths, and pymodbus 3.13.0's concurrent-multiplexed-request bug (its ServerRequestHandler.last_pdu state race)
- Status DTO + HTML gain inFlight / maxInFlight / txIdWraps / disconnectCascades / queueDepth (Tier 1.6 in docs/kpi.md)

Tests: 263 unit + 38 E2E. Multiplexer correctness under truly concurrent backend traffic is proved against a stub backend in PlcMultiplexerTests; MultiplexerE2ETests paces requests so pymodbus 3.13's single-PDU framer stays in known-good mode.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,404 @@
|
||||
using Mbproxy.Options;
|
||||
using Mbproxy.Proxy.Multiplexing;
|
||||
using Polly;
|
||||
|
||||
namespace Mbproxy.Proxy.Supervision;
|
||||
|
||||
/// <summary>
|
||||
/// Wraps one <see cref="PlcListener"/> in a Polly-backed recovery loop.
|
||||
///
|
||||
/// <para><b>State machine</b>:
|
||||
/// <list type="bullet">
|
||||
/// <item><description><b>Bound</b>: listener is accepting connections; <see cref="PlcListener.RunAsync"/> is awaiting.</description></item>
|
||||
/// <item><description><b>Recovering</b>: bind failed or RunAsync faulted; in Polly's delay window before the next attempt.</description></item>
|
||||
/// <item><description><b>Stopped</b>: terminal. <see cref="StopAsync"/> was called; no further retries.</description></item>
|
||||
/// </list>
|
||||
/// </para>
|
||||
///
|
||||
/// <para><b>RecoveryAttempts</b>: the counter accumulates over the lifetime of the
|
||||
/// supervisor. It is never reset after a successful re-bind so operators can see
|
||||
/// "this listener has flapped N times since the service started." See also
|
||||
/// <see cref="SupervisorSnapshot"/> doc comment.</para>
|
||||
///
|
||||
/// <para>The supervisor does NOT swallow exceptions from <see cref="PlcListener.RunAsync"/>
|
||||
/// except <see cref="OperationCanceledException"/>. Every other fault is logged at Warning
|
||||
/// with the exception message so operators can see WHY the listener was restarted.</para>
|
||||
/// </summary>
|
||||
internal sealed partial class PlcListenerSupervisor : IAsyncDisposable
|
||||
{
|
||||
private readonly PlcOptions _plc;
|
||||
private readonly ConnectionOptions _connectionOptions;
|
||||
private readonly IPduPipeline _pipeline;
|
||||
private readonly ILogger<PlcListener> _listenerLogger;
|
||||
private readonly ILogger<PlcMultiplexer> _multiplexerLogger;
|
||||
private readonly ILogger _pipeLogger;
|
||||
private readonly PerPlcContext? _perPlcContext;
|
||||
private readonly ResiliencePipeline _recoveryPipeline;
|
||||
private readonly ILogger<PlcListenerSupervisor> _logger;
|
||||
private readonly ResiliencePipeline? _backendConnectPipeline;
|
||||
|
||||
// ── Mutable state ────────────────────────────────────────────────────────────────────
|
||||
|
||||
// Volatile so Snapshot() reads are coherent without locking.
|
||||
private volatile SupervisorState _state = SupervisorState.Stopped;
|
||||
private volatile string? _lastBindError;
|
||||
private int _recoveryAttempts; // Interlocked
|
||||
|
||||
// Phase 07: current active listener for status-page pair enumeration.
|
||||
private volatile PlcListener? _currentListener;
|
||||
|
||||
// Phase 06: live context slot. _perPlcContext stays readonly (the value handed to
// the constructor); _currentContext is the mutable slot that ReplaceContextAsync swaps.
// It is written by ReplaceContextAsync and read by the supervisor loop
// (RunSupervisorAsync) each time it constructs a PlcListener, so it is a volatile
// reference: the loop sees the latest context without locking. Each PlcListener
// receives the context at construction time, so a listener that is already running
// (and its in-flight connections) keeps the reference it was built with.
|
||||
private volatile PerPlcContext? _currentContext;
|
||||
|
||||
/// <summary>
|
||||
/// Per-supervisor CTS: cancelling it stops both the Polly delay and the inner
|
||||
/// <see cref="PlcListener.RunAsync"/> loop.
|
||||
/// </summary>
|
||||
private CancellationTokenSource _supervisorCts = new();
|
||||
|
||||
private Task _supervisorTask = Task.CompletedTask;
|
||||
|
||||
private bool _disposed;
|
||||
|
||||
// ── Public surface ────────────────────────────────────────────────────────────────────
|
||||
|
||||
public string PlcName => _plc.Name;
|
||||
|
||||
/// <summary>
/// Creates a supervisor for one PLC listener. The constructor only stores its
/// arguments; nothing is bound or started until <see cref="StartAsync"/> is called.
/// </summary>
/// <param name="plc">PLC options; <c>Name</c> and <c>ListenPort</c> are used in log events and the object is handed to every <see cref="PlcListener"/> the supervisor creates.</param>
/// <param name="connectionOptions">Passed through to every <see cref="PlcListener"/>.</param>
/// <param name="pipeline">Passed through to every <see cref="PlcListener"/>.</param>
/// <param name="listenerLogger">Logger handed to every <see cref="PlcListener"/>.</param>
/// <param name="multiplexerLogger">Logger handed to every <see cref="PlcListener"/> for its multiplexer.</param>
/// <param name="pipeLogger">Logger handed to every <see cref="PlcListener"/> for its pipes.</param>
/// <param name="perPlcContext">Initial per-PLC context; may be <c>null</c>. Also seeds the swappable context slot.</param>
/// <param name="recoveryPipeline">Resilience pipeline that wraps the bind-and-run loop.</param>
/// <param name="logger">Logger for the supervisor's own events.</param>
/// <param name="backendConnectPipeline">Optional pipeline handed to every <see cref="PlcListener"/>.</param>
public PlcListenerSupervisor(
    PlcOptions plc,
    ConnectionOptions connectionOptions,
    IPduPipeline pipeline,
    ILogger<PlcListener> listenerLogger,
    ILogger<PlcMultiplexer> multiplexerLogger,
    ILogger pipeLogger,
    PerPlcContext? perPlcContext,
    ResiliencePipeline recoveryPipeline,
    ILogger<PlcListenerSupervisor> logger,
    ResiliencePipeline? backendConnectPipeline = null)
{
    // Inputs forwarded to each PlcListener instance.
    (_plc, _connectionOptions, _pipeline) = (plc, connectionOptions, pipeline);

    // Loggers: the supervisor's own, plus the three forwarded to the listener.
    (_logger, _listenerLogger, _multiplexerLogger, _pipeLogger) =
        (logger, listenerLogger, multiplexerLogger, pipeLogger);

    // Resilience pipelines.
    (_recoveryPipeline, _backendConnectPipeline) = (recoveryPipeline, backendConnectPipeline);

    // Context: the original value is kept as-is, and the same reference seeds the
    // live slot (Phase 06) that ReplaceContextAsync may later overwrite.
    _perPlcContext = perPlcContext;
    _currentContext = perPlcContext;
}
|
||||
|
||||
/// <summary>
/// The <see cref="ProxyCounters"/> of the context currently installed for this PLC.
/// When no context (or no counters object) is available, a freshly constructed
/// <see cref="ProxyCounters"/> is returned instead, so callers never receive <c>null</c>.
/// Used by <see cref="Configuration.ConfigReconciler"/> when building a reseat context
/// so that counters are preserved across a tag-map swap.
/// </summary>
public ProxyCounters CurrentCounters
{
    get
    {
        // Single volatile read so the null check and the dereference see the same context.
        if (_currentContext?.Counters is { } counters)
        {
            return counters;
        }

        return new ProxyCounters();
    }
}
|
||||
|
||||
/// <summary>
/// The <see cref="UpstreamPipe"/> collection exposed by the listener that is currently
/// published by the supervisor loop, or an empty collection when no listener is
/// published (not yet bound, recovering, or stopped).
/// Consumed by Phase 07's status page (renamed from <c>ActivePairs</c> in Phase 9).
/// </summary>
public IReadOnlyCollection<UpstreamPipe> ActiveUpstreams
{
    get
    {
        // One volatile read of the listener slot; the loop may null it at any time.
        if (_currentListener?.ActiveUpstreams is { } upstreams)
        {
            return upstreams;
        }

        return Array.Empty<UpstreamPipe>();
    }
}
|
||||
|
||||
/// <summary>
/// Launches the supervisor task. The task tries to bind immediately; if binding
/// fails it enters the recovery pipeline's retry loop. The method returns as soon as
/// the background task has been scheduled (it does NOT wait for the listener to reach
/// <see cref="SupervisorState.Bound"/>).
///
/// <para>Call <see cref="WaitForInitialBindAttemptAsync"/> after this to block until the
/// supervisor has transitioned out of <see cref="SupervisorState.Stopped"/>.</para>
/// </summary>
/// <param name="ct">
/// Outer lifetime token. The supervisor runs on a token linked to it, so cancelling
/// <paramref name="ct"/> stops the supervisor just like <see cref="StopAsync"/> does.
/// </param>
/// <returns>An already-completed task.</returns>
public Task StartAsync(CancellationToken ct)
{
    CancellationTokenSource linked = CancellationTokenSource.CreateLinkedTokenSource(ct);
    CancellationTokenSource previous = _supervisorCts;
    _supervisorCts = linked;

    // The field initialiser creates a placeholder CTS that would otherwise be
    // dropped without disposal. Only dispose the predecessor when no supervisor
    // task can still be observing its token.
    if (_supervisorTask.IsCompleted)
    {
        previous.Dispose();
    }

    // Capture the token now. Reading _supervisorCts.Token inside the lambda would
    // happen later on a pool thread, by which time the field may have been
    // replaced or disposed.
    CancellationToken supervisorToken = linked.Token;
    _supervisorTask = Task.Run(() => RunSupervisorAsync(supervisorToken), CancellationToken.None);

    return Task.CompletedTask;
}
|
||||
|
||||
/// <summary>
/// Polls every 10 ms until the supervisor has left <see cref="SupervisorState.Stopped"/>
/// (that is, it reached <see cref="SupervisorState.Bound"/> or
/// <see cref="SupervisorState.Recovering"/>), or until the supervisor task has
/// completed. Returns immediately if either is already the case.
/// </summary>
/// <param name="ct">
/// Cancels the wait. If it is already cancelled when checked the method simply
/// returns; if it fires during the polling delay the resulting
/// <see cref="OperationCanceledException"/> propagates to the caller.
/// </param>
public async Task WaitForInitialBindAttemptAsync(CancellationToken ct)
{
    while (true)
    {
        // Same short-circuit order as a single combined condition: state first,
        // then the caller's token, then the supervisor task itself.
        bool leftInitialState = _state != SupervisorState.Stopped;
        if (leftInitialState || ct.IsCancellationRequested || _supervisorTask.IsCompleted)
        {
            return;
        }

        await Task.Delay(10, ct).ConfigureAwait(false);
    }
}
|
||||
|
||||
/// <summary>
/// Stops the supervisor: publishes <see cref="SupervisorState.Stopped"/>, cancels the
/// supervisor token (which interrupts either the recovery delay or the
/// <see cref="PlcListener.RunAsync"/> loop, whichever is active), then waits for the
/// background task to finish.
///
/// <para>The wait is bounded by <paramref name="ct"/>. If <paramref name="ct"/> fires
/// first the method returns anyway and the supervisor task finishes on its own.
/// No exception from the supervisor task is surfaced to the caller.</para>
/// </summary>
/// <param name="ct">Bounds how long to wait for the supervisor task to complete.</param>
public async Task StopAsync(CancellationToken ct)
{
    _state = SupervisorState.Stopped;

    await _supervisorCts.CancelAsync().ConfigureAwait(false);

    Task supervisorTask = _supervisorTask;
    try
    {
        await supervisorTask.WaitAsync(ct).ConfigureAwait(false);
    }
    catch (Exception)
    {
        // Two cases, both deliberately ignored:
        //  * OperationCanceledException: ct fired before the task completed; the
        //    supervisor task terminates asynchronously. Acceptable at shutdown.
        //  * anything else: the supervisor task faulted, which RunSupervisorAsync
        //    has already logged.
    }
}
|
||||
|
||||
/// <summary>
/// Returns a point-in-time snapshot of this supervisor's state.
/// The three values are read one after another without a lock, so they are each
/// current but are not guaranteed to describe the exact same instant.
/// </summary>
public SupervisorSnapshot Snapshot()
{
    SupervisorState state = _state;
    string? lastError = _lastBindError;

    // CompareExchange(x, 0, 0) never changes the value; it is used purely as an
    // atomic, fenced read of the counter.
    int attempts = Interlocked.CompareExchange(ref _recoveryAttempts, 0, 0);

    return new SupervisorSnapshot(state, lastError, attempts);
}
|
||||
|
||||
/// <summary>
/// Swaps the per-PLC context (tag map) without restarting the listener.
///
/// <para><b>What the swap affects</b>: the method only overwrites the volatile
/// context slot. The supervisor loop reads that slot when it constructs a
/// <see cref="PlcListener"/>, so the next listener built (for example after a fault
/// recovery) receives <paramref name="newCtx"/>. The listener that is running right
/// now was handed its context at construction time and is not touched here, so
/// requests already in flight finish on the old map. Partial-BCD rewrites
/// mid-request would be worse than a short gap.</para>
///
/// <para>NOTE(review): earlier documentation stated that new connections accepted by
/// the already-running listener use the new map after this call. Nothing in this
/// class pushes the new context into the running listener; confirm against
/// <see cref="PlcListener"/> whether that holds.</para>
/// </summary>
/// <param name="newCtx">Context to install.</param>
/// <param name="ct">Unused; present for API symmetry with start/stop and future async expansion.</param>
/// <returns>An already-completed task.</returns>
public Task ReplaceContextAsync(PerPlcContext newCtx, CancellationToken ct)
{
    _ = ct; // intentionally unused: the swap is a single volatile write

    _currentContext = newCtx;

    return Task.CompletedTask;
}
|
||||
|
||||
// ── Supervisor loop ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/// <summary>Maximum number of characters of a fault reason that is published or counted.</summary>
private const int MaxReasonLength = 256;

/// <summary>
/// Body of the supervisor task. Runs the bind-and-accept cycle inside the recovery
/// pipeline: every cycle builds a fresh <see cref="PlcListener"/>, binds it, and
/// awaits its accept loop. Any failure other than cancellation is recorded via
/// <see cref="EnterRecovery"/> and re-thrown so the pipeline can delay and retry.
/// On exit, for whatever reason, the state is set to
/// <see cref="SupervisorState.Stopped"/> and the published listener is cleared.
/// </summary>
/// <param name="ct">Supervisor token; cancelling it ends the loop.</param>
private async Task RunSupervisorAsync(CancellationToken ct)
{
    // True until the first successful bind; selects the "bound" vs "recovered" event.
    bool firstBind = true;

    try
    {
        // The recovery pipeline wraps the entire try-bind-and-run block. The
        // callback never completes normally: it either throws a fault (retried by
        // the pipeline) or an OperationCanceledException (exits the pipeline).
        await _recoveryPipeline.ExecuteAsync(async token =>
        {
            // A faulted listener's socket must be released before re-binding, so a
            // new PlcListener is created per attempt. Reading _currentContext here
            // (volatile) picks up any ReplaceContextAsync call made since the last
            // attempt.
            var listener = new PlcListener(
                _plc,
                _connectionOptions,
                _pipeline,
                _listenerLogger,
                _multiplexerLogger,
                _pipeLogger,
                _currentContext,
                _backendConnectPipeline);

            // Phase 07: expose the current listener for status-page enumeration.
            _currentListener = listener;

            // ── Bind ─────────────────────────────────────────────────────────
            try
            {
                // NOTE(review): the result of StartAsync() is not awaited. This is
                // only correct if bind failures are thrown synchronously; confirm
                // against PlcListener.
                listener.StartAsync();
            }
            catch (Exception bindEx)
            {
                // Release the socket before the recovery delay so the port can be reused.
                await RetireListenerAsync(listener).ConfigureAwait(false);

                string reason = EnterRecovery(bindEx.Message);
                LogBindFailed(_logger, _plc.Name, _plc.ListenPort, reason);

                throw; // let the pipeline delay and retry
            }

            // ── Bind succeeded ───────────────────────────────────────────────
            if (firstBind)
            {
                firstBind = false;
                LogBound(_logger, _plc.Name, _plc.ListenPort);
            }
            else
            {
                // Re-bind after an earlier successful bind: emit the "recovered" event.
                int totalAttempts = Interlocked.CompareExchange(ref _recoveryAttempts, 0, 0);
                LogListenerRecovered(_logger, _plc.Name, _plc.ListenPort, totalAttempts);
            }

            // A successful bind clears the last error before the state flips to Bound.
            _lastBindError = null;
            _currentContext?.Counters.ClearLastBindError();
            _state = SupervisorState.Bound;

            // ── Run the accept loop ──────────────────────────────────────────
            try
            {
                await listener.RunAsync(token).ConfigureAwait(false);
            }
            catch (OperationCanceledException)
            {
                // Shutdown path: no recovery bookkeeping, just release and exit.
                // NOTE(review): an OperationCanceledException that is NOT caused by
                // `token` also takes this path and ends the supervisor; confirm
                // RunAsync cannot raise one on its own.
                await RetireListenerAsync(listener).ConfigureAwait(false);
                throw;
            }
            catch (Exception runEx)
            {
                // Runtime fault (port stolen, OS network reset, etc.). Logged at
                // Warning so operators can see WHY the listener was restarted.
                LogListenerFaulted(_logger, _plc.Name, _plc.ListenPort, runEx, runEx.Message);
                await RetireListenerAsync(listener).ConfigureAwait(false);

                EnterRecovery(runEx.Message);

                throw; // let the pipeline delay and retry
            }

            // RunAsync returned without throwing.
            await RetireListenerAsync(listener).ConfigureAwait(false);

            // If that was because of cancellation, leave the pipeline cleanly.
            token.ThrowIfCancellationRequested();

            // Otherwise the accept loop ended on its own: treat it as a fault.
            const string unexpectedEnd = "Listener accept loop ended unexpectedly";
            EnterRecovery(unexpectedEnd);
            LogListenerEnded(_logger, _plc.Name, _plc.ListenPort);
            throw new InvalidOperationException(unexpectedEnd);
        }, ct).ConfigureAwait(false);
    }
    catch (OperationCanceledException)
    {
        // Normal: StopAsync (or the outer token) cancelled the supervisor.
    }
    catch (Exception ex)
    {
        // Pipeline gave up (not expected for listener recovery, which is built to
        // retry indefinitely) or something unexpected escaped.
        _logger.LogError(ex, "Supervisor for Plc={Plc} exited unexpectedly: {Message}",
            _plc.Name, ex.Message);
    }
    finally
    {
        _state = SupervisorState.Stopped;
        _currentListener = null;
    }
}

/// <summary>
/// Unpublishes <paramref name="listener"/> from the status surface, then disposes it.
/// The slot is cleared first so readers stop seeing a listener that is being torn down.
/// </summary>
private async Task RetireListenerAsync(PlcListener listener)
{
    _currentListener = null;
    await listener.DisposeAsync().ConfigureAwait(false);
}

/// <summary>
/// Records one recovery event: bumps the lifetime attempt counter, publishes the
/// reason (truncated to <see cref="MaxReasonLength"/> characters), moves the state to
/// <see cref="SupervisorState.Recovering"/>, and mirrors the event into the per-PLC
/// counters when a context is installed.
/// </summary>
/// <param name="message">Raw fault message.</param>
/// <returns>The truncated reason that was published.</returns>
private string EnterRecovery(string message)
{
    Interlocked.Increment(ref _recoveryAttempts);

    string truncated = message.Length > MaxReasonLength ? message[..MaxReasonLength] : message;
    _lastBindError = truncated;
    _state = SupervisorState.Recovering;

    // Also update the per-PLC counters if available (Phase 07 reads these).
    _currentContext?.Counters.IncrementRecoveryAttempt(truncated);

    return truncated;
}
|
||||
|
||||
// ── IAsyncDisposable ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Stops the supervisor (waiting at most 5 seconds for the background task) and
/// disposes the supervisor's cancellation source. Calls after the first are no-ops.
/// Never throws because of a failed stop; cleanup is best-effort.
/// </summary>
public async ValueTask DisposeAsync()
{
    if (_disposed)
    {
        return;
    }

    _disposed = true;

    var stopTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(5));
    try
    {
        try
        {
            await StopAsync(stopTimeout.Token).ConfigureAwait(false);
        }
        catch
        {
            // Best-effort cleanup.
        }

        _supervisorCts.Dispose();
    }
    finally
    {
        stopTimeout.Dispose();
    }
}
|
||||
|
||||
// ── Logging ───────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// <summary>Event 40: the listener bound successfully for the first time.</summary>
[LoggerMessage(
    EventId = 40,
    EventName = "mbproxy.startup.bind",
    Level = LogLevel.Information,
    Message = "Listener bound: Plc={Plc} Port={Port}")]
private static partial void LogBound(ILogger logger, string plc, int port);

/// <summary>Event 41: a bind attempt failed; <paramref name="reason"/> is the fault message.</summary>
[LoggerMessage(
    EventId = 41,
    EventName = "mbproxy.startup.bind.failed",
    Level = LogLevel.Error,
    Message = "Failed to bind listener: Plc={Plc} Port={Port} Reason={Reason}")]
private static partial void LogBindFailed(ILogger logger, string plc, int port, string reason);

/// <summary>Event 42: the listener bound again after an earlier successful bind.</summary>
[LoggerMessage(
    EventId = 42,
    EventName = "mbproxy.listener.recovered",
    Level = LogLevel.Information,
    Message = "Listener recovered: Plc={Plc} Port={Port} AttemptCount={AttemptCount}")]
private static partial void LogListenerRecovered(ILogger logger, string plc, int port, int attemptCount);

/// <summary>Event 43: the accept loop threw; the supervisor will retry.</summary>
[LoggerMessage(
    EventId = 43,
    EventName = "mbproxy.listener.faulted",
    Level = LogLevel.Warning,
    Message = "Listener faulted (will recover): Plc={Plc} Port={Port} Reason={Reason}")]
private static partial void LogListenerFaulted(ILogger logger, string plc, int port, Exception ex, string reason);

/// <summary>Event 44: the accept loop returned without cancellation; the supervisor will retry.</summary>
[LoggerMessage(
    EventId = 44,
    EventName = "mbproxy.listener.ended",
    Level = LogLevel.Warning,
    Message = "Listener accept loop ended unexpectedly (will recover): Plc={Plc} Port={Port}")]
private static partial void LogListenerEnded(ILogger logger, string plc, int port);
|
||||
}
|
||||
@@ -0,0 +1,125 @@
|
||||
using System.Net.Sockets;
|
||||
using Mbproxy.Options;
|
||||
using Polly;
|
||||
using Polly.Retry;
|
||||
|
||||
namespace Mbproxy.Proxy.Supervision;
|
||||
|
||||
/// <summary>
|
||||
/// Builds Polly v8 <see cref="ResiliencePipeline"/> instances from the typed resilience
|
||||
/// configuration (<see cref="RetryProfile"/> and <see cref="RecoveryProfile"/>).
|
||||
///
|
||||
/// <para>Pipelines are built once at startup and reused across all operations. They are
|
||||
/// thread-safe and allocation-free on the happy path.</para>
|
||||
/// </summary>
|
||||
internal static class PolicyFactory
|
||||
{
|
||||
// ── Socket errors that the backend-connect pipeline retries ─────────────────────────
// Membership in this set is the sole retry criterion used by BuildBackendConnect:
// a SocketException whose SocketErrorCode is listed here is treated as transient,
// and every other failure is surfaced to the caller without a retry.
private static readonly HashSet<SocketError> RetryableSocketErrors = new()
{
    SocketError.ConnectionRefused,
    SocketError.TimedOut,
    SocketError.HostUnreachable,
    SocketError.NetworkUnreachable,
};
|
||||
|
||||
/// <summary>
/// Builds a retry pipeline for backend (PLC) TCP connect attempts.
///
/// <para>Retries only on <see cref="SocketException"/> whose
/// <see cref="SocketException.SocketErrorCode"/> is in <see cref="RetryableSocketErrors"/>.
/// Any other exception, including <see cref="ArgumentException"/> and
/// <see cref="OperationCanceledException"/>, is not retried.</para>
///
/// <para><see cref="RetryProfile.MaxAttempts"/> counts the first attempt, so the number
/// of retries is <c>MaxAttempts - 1</c>. A value of 1 or less yields a pipeline that
/// performs exactly one attempt and no retries.</para>
///
/// <para>The delay before retry <c>i</c> (0-based) is <see cref="RetryProfile.BackoffMs"/>[i];
/// once the index runs past the end the last element is reused, and an empty list
/// means no delay.</para>
///
/// <para>After all attempts are exhausted the last exception propagates to the caller,
/// which can then log <c>mbproxy.backend.failed</c> and close the upstream socket.</para>
/// </summary>
/// <param name="profile">Retry settings; read once, at build time.</param>
/// <param name="logger">Receives a Debug entry for every retry.</param>
/// <returns>The configured pipeline, or <see cref="ResiliencePipeline.Empty"/> when no retries are requested.</returns>
public static ResiliencePipeline BuildBackendConnect(RetryProfile profile, ILogger logger)
{
    // MaxAttempts includes the first attempt; anything below 1 is treated as 1.
    int maxAttempts = Math.Max(1, profile.MaxAttempts);
    int maxRetries = maxAttempts - 1;

    if (maxRetries == 0)
    {
        // A single attempt with no retries is exactly what the empty pipeline does.
        // This also avoids passing MaxRetryAttempts = 0 to RetryStrategyOptions,
        // whose documented valid range starts at 1 and is enforced when the
        // strategy is added (verify against the referenced Polly version).
        return ResiliencePipeline.Empty;
    }

    var backoffMs = profile.BackoffMs;

    return new ResiliencePipelineBuilder()
        .AddRetry(new RetryStrategyOptions
        {
            MaxRetryAttempts = maxRetries,
            ShouldHandle = new PredicateBuilder()
                .Handle<SocketException>(ex => RetryableSocketErrors.Contains(ex.SocketErrorCode)),
            DelayGenerator = args =>
            {
                int idx = args.AttemptNumber; // 0 = first retry, i.e. after attempt 0

                // Clamp to the last element if we run past the configured list.
                int ms = backoffMs.Count > 0
                    ? backoffMs[Math.Min(idx, backoffMs.Count - 1)]
                    : 0;

                return new ValueTask<TimeSpan?>(TimeSpan.FromMilliseconds(ms));
            },
            OnRetry = args =>
            {
                // Reported as "retry k of maxRetries" (1-based).
                logger.LogDebug(
                    "Backend connect retry {Attempt}/{Max}: {Error}",
                    args.AttemptNumber + 1,
                    maxRetries,
                    args.Outcome.Exception?.Message);
                return ValueTask.CompletedTask;
            },
        })
        .Build();
}
|
||||
|
||||
/// <summary>
/// Builds the retry pipeline used for listener bind recovery. It is configured with
/// <see cref="int.MaxValue"/> retries, so in practice it keeps retrying until the
/// token passed to <c>ExecuteAsync</c> is cancelled.
///
/// <para>Delay schedule, indexed by the 0-based retry number:
/// <list type="bullet">
/// <item><description>Retries 0 .. (InitialBackoffMs.Count-1) use the matching entry of <see cref="RecoveryProfile.InitialBackoffMs"/>.</description></item>
/// <item><description>Every later retry uses <see cref="RecoveryProfile.SteadyStateMs"/>.</description></item>
/// </list></para>
///
/// <para>Every exception except <see cref="OperationCanceledException"/> triggers a
/// retry. A cancellation therefore escapes the pipeline unhandled, which is how
/// <see cref="PlcListenerSupervisor.StopAsync"/> ends the supervisor loop.</para>
/// </summary>
/// <param name="profile">Recovery settings; read once, at build time.</param>
/// <param name="logger">Receives a Debug entry for every retry.</param>
/// <returns>The configured pipeline.</returns>
public static ResiliencePipeline BuildListenerRecovery(RecoveryProfile profile, ILogger logger)
{
    var initialMs = profile.InitialBackoffMs;
    int steadyMs = profile.SteadyStateMs;

    var retryOptions = new RetryStrategyOptions
    {
        // Effectively unbounded: cancellation is the intended way out.
        MaxRetryAttempts = int.MaxValue,

        ShouldHandle = new PredicateBuilder().Handle<Exception>(
            ex => ex is not OperationCanceledException),

        DelayGenerator = args =>
        {
            // AttemptNumber is the 0-based retry index (0 = first retry, which
            // follows the first failed attempt).
            int retryIndex = args.AttemptNumber;

            int delayMs = steadyMs;
            if (retryIndex < initialMs.Count)
            {
                delayMs = initialMs[retryIndex];
            }

            return new ValueTask<TimeSpan?>(TimeSpan.FromMilliseconds(delayMs));
        },

        OnRetry = args =>
        {
            logger.LogDebug(
                "Listener recovery attempt {Attempt}: {Error}",
                args.AttemptNumber + 1,
                args.Outcome.Exception?.Message);
            return ValueTask.CompletedTask;
        },
    };

    var builder = new ResiliencePipelineBuilder();
    builder.AddRetry(retryOptions);
    return builder.Build();
}
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
namespace Mbproxy.Proxy.Supervision;
|
||||
|
||||
/// <summary>
/// States reported by <see cref="PlcListenerSupervisor"/>. The numeric values are
/// spelled out so they stay stable if members are ever reordered.
/// </summary>
public enum SupervisorState
{
    /// <summary>
    /// The listener is bound and its accept loop is running. Entered once
    /// <see cref="PlcListener.StartAsync"/> has succeeded, whether on the first attempt
    /// or after a recovery attempt.
    /// </summary>
    Bound = 0,

    /// <summary>
    /// The listener is not bound and the supervisor is inside the retry delay before
    /// the next attempt. Entered after a failed bind, after the accept loop faults,
    /// and after the accept loop ends without cancellation.
    /// </summary>
    Recovering = 1,

    /// <summary>
    /// The supervisor is not running. This is both the value a supervisor holds before
    /// its first bind attempt and the value it ends in once
    /// <see cref="PlcListenerSupervisor.StopAsync"/> has been called or the supervisor
    /// task has exited; after that no further retries are made.
    /// </summary>
    Stopped = 2,
}
|
||||
|
||||
/// <summary>
/// Immutable point-in-time view of a supervisor, produced by
/// <see cref="PlcListenerSupervisor.Snapshot"/> and consumed by Phase 07's status page.
///
/// <para><b>RecoveryAttempts semantics</b>: the counter <em>accumulates over the
/// lifetime of the supervisor</em> and is never reset, not even after a successful
/// re-bind. Read it as "how many times has this listener failed to bind, faulted, or
/// ended unexpectedly since the service started". It is useful for spotting
/// port-flapping or repeated OS network resets. Phase 07 surfaces it as-is.</para>
/// </summary>
/// <param name="State">State of the supervisor at the time of the snapshot.</param>
/// <param name="LastBindError">
/// Reason for the most recent failure (bind failure, accept-loop fault, or unexpected
/// loop end), truncated to 256 characters. <c>null</c> when nothing has failed yet or
/// when the listener has bound successfully since the last failure.
/// </param>
/// <param name="RecoveryAttempts">
/// Lifetime count of recovery events for this supervisor. Only ever increases.
/// </param>
public sealed record SupervisorSnapshot(
    SupervisorState State,
    string? LastBindError,
    int RecoveryAttempts);
|
||||
Reference in New Issue
Block a user