mbproxy: Wave 1 fixes from 2026-05-14 code review

Resolves the four critical correctness defects + the ShutdownCoordinator
double-stop ordering bug called out in codereviews/2026-05-14/Overview.md.
Tests: 362 pass / 0 fail (baseline 358 + 4 new W1 regression tests).

W1.1 — Context swap on running multiplexer.
  PlcMultiplexer._ctx becomes volatile with a new ReplaceContext() method
  that re-registers the cache stats provider on the (preserved) counters.
  PlcListener exposes its multiplexer; PlcListenerSupervisor.ReplaceContextAsync
  swaps the running mux first, then disposes the old cache. Hot-reload
  tag-list changes and the cache-flush-on-reload contract now actually take
  effect on the next PDU instead of waiting for the next listener fault.

W1.2 — Coalescing factory leak.
  When the InFlightByKey factory soft-fails (allocator saturation or duplicate
  TxId), the cleanup path now TryRemoves the stub and walks every party on it
  (including late attachers) to deliver Modbus exception 0x04. Previously
  only the leader got the exception; late attachers waited forever for a
  response that no backend round-trip would ever fire.

W1.3 — Backend-reader head-of-line block.
  UpstreamPipe gains TrySendResponse for non-blocking enqueue. The per-PLC
  backend reader's fan-out loop uses it instead of awaiting SendResponseAsync,
  so a wedged upstream's full bounded response channel can no longer stall
  the single backend reader and starve every other client on that PLC. New
  responseDropForFullUpstream counter on ProxyCounters / CounterSnapshot
  records the drops.

W1.4 — Stranded outbound frames after cascade.
  TearDownBackendAsync acquires _connectGate and drains any frames left in
  _outboundChannel after the writer task faulted/cancelled, releasing their
  proxy TxIds back to the allocator. Without this, a fresh
  EnsureBackendConnectedAsync racing the cascade would send stranded frames
  with old TxIds onto the new backend socket; the responses would arrive
  with no correlation entry and the upstream peers would hang until the
  watchdog timed them out after BackendRequestTimeoutMs.

W1.5 — Delete ShutdownCoordinator (Option B).
  Drain logic moved into ProxyWorker.StopAsync. AdminEndpointHost is no
  longer registered as IHostedService; ProxyWorker drives its lifecycle
  directly so admin starts after listeners are bound and stops AFTER the
  in-flight drain (the design's documented contract). Admin is resolved
  lazily in ExecuteAsync to break the circular DI graph
  (Admin -> StatusSnapshotBuilder -> ProxyWorker). GracefulShutdownTimeoutMs
  is now read fresh from IOptionsMonitor.CurrentValue at stop time, so a
  hot-reloaded value is honoured. Removes ShutdownCoordinator + tests.

New tests:
  PlcMultiplexerTests.ReplaceContext_NewTagMap_VisibleOnNextPdu
  PlcMultiplexerTests.ReplaceContext_NewCache_NextReadGoesToBackend_NotOldCache
  UpstreamPipeTests.TrySendResponse_WhenChannelFull_ReturnsFalse_WithoutBlocking
  UpstreamPipeTests.TrySendResponse_AfterDispose_ReturnsFalse

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-14 05:16:13 -04:00
parent f2c6669444
commit ce32c5cee8
14 changed files with 614 additions and 532 deletions
+113 -3
View File
@@ -1,3 +1,5 @@
using System.Diagnostics;
using Mbproxy.Admin;
using Mbproxy.Bcd;
using Mbproxy.Configuration;
using Mbproxy.Options;
@@ -5,7 +7,6 @@ using Mbproxy.Proxy.Cache;
using Mbproxy.Proxy.Multiplexing;
using Mbproxy.Proxy.Supervision;
using Microsoft.Extensions.Options;
using Polly;
namespace Mbproxy.Proxy;
@@ -34,6 +35,16 @@ internal sealed partial class ProxyWorker : BackgroundService
private readonly ILogger<ProxyWorker> _logger;
private readonly ILoggerFactory _loggerFactory;
private readonly ConfigReconciler _reconciler;
// Phase 12 (W1.5) — admin endpoint is no longer IHostedService; ProxyWorker drives its
// lifecycle directly so the design's "drain THEN stop admin" ordering is honoured.
//
// Resolved LAZILY (in ExecuteAsync) rather than in the constructor because the DI graph
// is circular: AdminEndpointHost → StatusSnapshotBuilder → ProxyWorker. A constructor
// GetService<AdminEndpointHost>() during ProxyWorker's own construction returns null
// silently. Lazy resolution sidesteps the cycle — by the time ExecuteAsync runs the DI
// container is fully built.
private readonly IServiceProvider _services;
private AdminEndpointHost? _admin;
// Phase 06: supervisors are now managed jointly by ProxyWorker (initial bootstrap)
// and ConfigReconciler (subsequent hot-reload changes). The dictionary is shared
@@ -52,13 +63,16 @@ internal sealed partial class ProxyWorker : BackgroundService
IPduPipeline pipeline,
ILogger<ProxyWorker> logger,
ILoggerFactory loggerFactory,
ConfigReconciler reconciler)
ConfigReconciler reconciler,
IServiceProvider services)
{
_options = options;
_pipeline = pipeline;
_logger = logger;
_loggerFactory = loggerFactory;
_reconciler = reconciler;
_services = services;
// Phase 12 (W1.5) — admin endpoint resolved lazily in ExecuteAsync (see field comment).
}
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
@@ -188,17 +202,58 @@ internal sealed partial class ProxyWorker : BackgroundService
int boundCount = _supervisors.Values.Count(s => s.Snapshot().State == SupervisorState.Bound);
LogStartupReady(_logger, boundCount, plcsConfigured);
// Phase 12 (W1.5) — start the admin endpoint AFTER listeners are bound so the
// status page can never observe the service in a "no PLCs configured yet" state.
// The admin endpoint is no longer registered as IHostedService (the host's reverse
// stop order would tear it down BEFORE drain). ProxyWorker drives both ends.
//
// Resolution happens here, not in the constructor — the DI graph is circular
// (admin → StatusSnapshotBuilder → ProxyWorker) and a constructor-time lookup
// returns null silently.
_admin = _services.GetService<AdminEndpointHost>();
if (_admin is not null)
{
try
{
await _admin.StartAsync(stoppingToken).ConfigureAwait(false);
}
catch (Exception ex)
{
_logger.LogError(ex, "Admin endpoint failed to start: {Message}", ex.Message);
}
}
// ── 6. Keep the worker alive until the host signals stop ─────────────────────
// Supervisors run their own background loops; ExecuteAsync just waits.
await Task.Delay(Timeout.Infinite, stoppingToken).ConfigureAwait(false);
}
/// <summary>
/// Phase 12 (W1.5) — graceful shutdown sequence (replaces the deleted
/// <c>ShutdownCoordinator</c>):
/// <list type="number">
/// <item>Cancel <see cref="ExecuteAsync"/> via <c>base.StopAsync</c>.</item>
/// <item>Stop all supervisors with a 5 s hard deadline (no new connections; existing
/// pipes are cascaded by <see cref="PlcListenerSupervisor"/> teardown).</item>
/// <item>Wait for in-flight PDUs to drain via the live
/// <see cref="ConnectionOptions.GracefulShutdownTimeoutMs"/> (read fresh from
/// <see cref="IOptionsMonitor{T}.CurrentValue"/> so a hot-reloaded value is
/// honoured at stop time).</item>
/// <item>Stop the admin endpoint LAST so the status page survives the drain phase
/// and an operator polling it sees the in-flight count fall to zero.</item>
/// <item>Dispose every supervisor to release sockets, channels, and watchdog timers.</item>
/// </list>
/// Logs <c>mbproxy.shutdown.complete</c> on the way out with the in-flight count at
/// drain-deadline (zero on a clean shutdown, positive when forced cancel).
/// </summary>
public override async Task StopAsync(CancellationToken cancellationToken)
{
// Cancel ExecuteAsync first.
await base.StopAsync(cancellationToken).ConfigureAwait(false);
// Stop all supervisors in parallel with a 5-second hard deadline.
var sw = Stopwatch.StartNew();
// ── 1. Stop accepting new connections ─────────────────────────────────────────
using var stopCts = new CancellationTokenSource(TimeSpan.FromSeconds(5));
using var linked = CancellationTokenSource.CreateLinkedTokenSource(
stopCts.Token, cancellationToken);
@@ -216,10 +271,59 @@ internal sealed partial class ProxyWorker : BackgroundService
// Best effort — don't let individual supervisor failures block shutdown.
}
// ── 2. Drain in-flight PDUs ───────────────────────────────────────────────────
// Reads the current configured deadline so a hot-reloaded
// GracefulShutdownTimeoutMs is honoured at stop time, not frozen at process start.
int drainDeadlineMs = _options.CurrentValue.Connection.GracefulShutdownTimeoutMs;
int inFlightAtCancel = 0;
if (drainDeadlineMs > 0)
{
    // Linked to the host's token (same pattern as the supervisor-stop stage) so a host
    // that has given up on graceful shutdown also ends the drain, not just the
    // configured deadline.
    using var drainCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
    drainCts.CancelAfter(TimeSpan.FromMilliseconds(drainDeadlineMs));
    try
    {
        // The deadline is observed ONLY through the cancellable delay, so this loop can
        // end in exactly two ways: the count reached zero (clean drain), or the delay
        // threw. Checking the token in the loop condition instead would let the loop
        // fall out silently when the deadline fires between the delay completing and
        // the next check, reporting InFlightAtCancel=0 on a forced shutdown.
        while (CountInFlight() > 0)
        {
            await Task.Delay(10, drainCts.Token).ConfigureAwait(false);
        }
    }
    catch (OperationCanceledException)
    {
        // Deadline hit (or host cancelled) with work still outstanding: record what is
        // being abandoned for the shutdown-complete log line.
        inFlightAtCancel = CountInFlight();
    }
}
// ── 3. Stop admin endpoint LAST ───────────────────────────────────────────────
if (_admin is not null)
{
    try
    {
        // Independent 2 s budget so a wedged admin endpoint cannot hold up the rest
        // of the shutdown sequence.
        using var adminCts = new CancellationTokenSource(TimeSpan.FromSeconds(2));
        await _admin.StopAsync(adminCts.Token).ConfigureAwait(false);
    }
    catch (Exception ex)
    {
        // Still best-effort (shutdown continues), but leave a trace instead of
        // swallowing the failure silently.
        _logger.LogWarning(ex, "Admin endpoint failed to stop cleanly: {Message}", ex.Message);
    }
}
// ── 4. Dispose supervisors (releases sockets, channels, watchdog timers) ─────
foreach (var supervisor in _supervisors.Values)
{
    try
    {
        await supervisor.DisposeAsync().ConfigureAwait(false);
    }
    catch (Exception ex)
    {
        // One failing supervisor must not prevent the remaining supervisors from being
        // disposed, nor skip the Clear() and the shutdown-complete log below.
        _logger.LogWarning(ex, "Supervisor dispose failed during shutdown: {Message}", ex.Message);
    }
}
_supervisors.Clear();
LogShutdownComplete(_logger, inFlightAtCancel, sw.ElapsedMilliseconds);
}
/// <summary>
/// Adds up the <c>InFlightCount</c> taken from each supervisor's current counter snapshot.
/// </summary>
/// <returns>
/// The sum across every supervisor in <c>_supervisors</c>; zero when the collection is empty.
/// </returns>
private int CountInFlight()
{
    int inFlight = 0;
    foreach (var entry in _supervisors.Values)
    {
        var snapshot = entry.CurrentCounters.Snapshot();
        inFlight += (int)snapshot.InFlightCount;
    }

    return inFlight;
}
// ── Logging ───────────────────────────────────────────────────────────────────────────
@@ -247,4 +351,10 @@ internal sealed partial class ProxyWorker : BackgroundService
Level = LogLevel.Error,
Message = "Failed to bind listener: Plc={Plc} Port={Port} Reason={Reason}")]
private static partial void LogBindFailed(ILogger logger, string plc, int port, string reason);
// Phase 12 (W1.5) — moved here from the deleted ShutdownCoordinator.
/// <summary>
/// Logger-message declaration for the <c>mbproxy.shutdown.complete</c> event
/// (event id 80, <see cref="LogLevel.Information"/>). The implementation is supplied
/// by the <c>[LoggerMessage]</c> attribute on this partial method.
/// </summary>
/// <param name="logger">Logger the event is written to.</param>
/// <param name="inFlightAtCancel">Value rendered into the <c>{InFlightAtCancel}</c> placeholder.</param>
/// <param name="elapsedMs">Value rendered into the <c>{ElapsedMs}</c> placeholder.</param>
[LoggerMessage(EventId = 80, EventName = "mbproxy.shutdown.complete",
Level = LogLevel.Information,
Message = "Graceful shutdown complete: InFlightAtCancel={InFlightAtCancel} ElapsedMs={ElapsedMs}")]
private static partial void LogShutdownComplete(ILogger logger, int inFlightAtCancel, long elapsedMs);
}