mbproxy: Wave 1 fixes from 2026-05-14 code review
Resolves the four critical correctness defects + the ShutdownCoordinator double-stop ordering bug called out in codereviews/2026-05-14/Overview.md. Tests: 362 pass / 0 fail (baseline 358 + 4 new W1 regression tests).

W1.1 — Context swap on running multiplexer. PlcMultiplexer._ctx becomes volatile with a new ReplaceContext() method that re-registers the cache stats provider on the (preserved) counters. PlcListener exposes its multiplexer; PlcListenerSupervisor.ReplaceContextAsync swaps the running mux first, then disposes the old cache. Hot-reload tag-list changes and the cache-flush-on-reload contract now actually take effect on the next PDU instead of waiting for the next listener fault.

W1.2 — Coalescing factory leak. When the InFlightByKey factory soft-fails (allocator saturation or duplicate TxId), the cleanup path now TryRemoves the stub and walks every party on it (including late attachers) to deliver Modbus exception 0x04. Previously only the leader got the exception; late attachers waited forever for a response that no backend round-trip would ever fire.

W1.3 — Backend-reader head-of-line block. UpstreamPipe gains TrySendResponse for non-blocking enqueue. The per-PLC backend reader's fan-out loop uses it instead of awaiting SendResponseAsync, so a wedged upstream's full bounded response channel can no longer stall the single backend reader and starve every other client on that PLC. New responseDropForFullUpstream counter on ProxyCounters / CounterSnapshot records the drops.

W1.4 — Stranded outbound frames after cascade. TearDownBackendAsync acquires _connectGate and drains any frames left in _outboundChannel after the writer task faulted/cancelled, releasing their proxy TxIds back to the allocator. Without this, a fresh EnsureBackendConnectedAsync racing the cascade would send stranded frames with old TxIds onto the new backend socket; the responses would arrive with no correlation entry and the upstream peers would hang on the watchdog until BackendRequestTimeoutMs.

W1.5 — Delete ShutdownCoordinator (Option B). Drain logic moved into ProxyWorker.StopAsync. AdminEndpointHost is no longer registered as IHostedService; ProxyWorker drives its lifecycle directly so admin starts after listeners are bound and stops AFTER the in-flight drain (the design's documented contract). Admin is resolved lazily in ExecuteAsync to break the circular DI graph (Admin -> StatusSnapshotBuilder -> ProxyWorker). GracefulShutdownTimeoutMs is now read fresh from IOptionsMonitor.CurrentValue at stop time, so a hot-reloaded value is honoured. Removes ShutdownCoordinator + tests.

New tests:
  PlcMultiplexerTests.ReplaceContext_NewTagMap_VisibleOnNextPdu
  PlcMultiplexerTests.ReplaceContext_NewCache_NextReadGoesToBackend_NotOldCache
  UpstreamPipeTests.TrySendResponse_WhenChannelFull_ReturnsFalse_WithoutBlocking
  UpstreamPipeTests.TrySendResponse_AfterDispose_ReturnsFalse

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,177 +0,0 @@
|
||||
using Mbproxy.Diagnostics;
|
||||
using Mbproxy.Options;
|
||||
using Mbproxy.Proxy;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
|
||||
namespace Mbproxy.Tests.Diagnostics;
|
||||
|
||||
/// <summary>
/// Unit tests covering <see cref="ShutdownCoordinator"/>.
/// Each test constructs the coordinator from the fake supervisor and admin-endpoint
/// handles declared in this class, using the constructor that accepts those handles.
/// </summary>
[Trait("Category", "Unit")]
public sealed class ShutdownCoordinatorTests
{
    // ── Test doubles ──────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Admin-endpoint fake that records whether <see cref="StopAsync"/> ran and, when an
    /// ordering source was supplied, the sequence number handed out at that moment.
    /// </summary>
    private sealed class FakeAdminHandle : IAdminEndpointHandle
    {
        private readonly Func<int>? _sequence;

        public FakeAdminHandle(Func<int>? orderSource = null)
        {
            _sequence = orderSource;
        }

        /// <summary>True once <see cref="StopAsync"/> has been invoked.</summary>
        public bool StopCalled { get; private set; }

        /// <summary>Value returned by the ordering source at stop time; 0 when none was given.</summary>
        public int StopCallOrder { get; private set; }

        public Task StopAsync(CancellationToken ct)
        {
            StopCalled = true;
            StopCallOrder = _sequence is null ? 0 : _sequence();
            return Task.CompletedTask;
        }
    }

    /// <summary>
    /// Supervisor fake whose stop completes synchronously. Records the call and its
    /// ordering stamp, and exposes a settable <see cref="InFlightCount"/>.
    /// </summary>
    private sealed class SimpleFakeSupervisor : ISupervisorHandle
    {
        private readonly Func<int>? _sequence;

        public SimpleFakeSupervisor(Func<int>? orderSource = null)
        {
            _sequence = orderSource;
        }

        /// <summary>True once <see cref="StopAsync"/> has been invoked.</summary>
        public bool StopCalled { get; private set; }

        /// <summary>Value returned by the ordering source at stop time; 0 when none was given.</summary>
        public int StopCallOrder { get; private set; }

        /// <summary>In-flight count reported to the coordinator; defaults to 0.</summary>
        public int InFlightCount { get; set; }

        public Task StopAsync(CancellationToken ct)
        {
            StopCalled = true;
            StopCallOrder = _sequence is null ? 0 : _sequence();
            return Task.CompletedTask;
        }
    }

    /// <summary>
    /// Supervisor fake whose stop awaits a caller-supplied delegate, letting a test make
    /// the stop take observable time. Always reports zero in-flight work.
    /// </summary>
    private sealed class DelayedStopSupervisor : ISupervisorHandle
    {
        private readonly Func<Task> _stopBody;

        public DelayedStopSupervisor(Func<Task> onStop)
        {
            _stopBody = onStop;
        }

        public int InFlightCount
        {
            get { return 0; }
        }

        public async Task StopAsync(CancellationToken ct)
        {
            await _stopBody();
        }
    }

    // ── Factory ───────────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Creates a coordinator over the given handles, with
    /// <c>Connection.GracefulShutdownTimeoutMs</c> set to <paramref name="timeoutMs"/>
    /// and a null logger.
    /// </summary>
    private static ShutdownCoordinator Build(
        IReadOnlyList<ISupervisorHandle> supervisors,
        IAdminEndpointHandle admin,
        int timeoutMs = 500)
    {
        var connection = new ConnectionOptions { GracefulShutdownTimeoutMs = timeoutMs };
        var settings = new MbproxyOptions { Connection = connection };
        var wrappedOptions = Microsoft.Extensions.Options.Options.Create(settings);
        var logger = NullLogger<ShutdownCoordinator>.Instance;

        return new ShutdownCoordinator(supervisors, admin, wrappedOptions, logger);
    }

    // ── Test cases ────────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// A supervisor reporting zero in-flight work and a generous 5 s timeout: shutdown
    /// must finish in under a second and stop both the supervisor and the admin handle.
    /// </summary>
    [Fact]
    public async Task Shutdown_NoActiveConnections_CompletesImmediately()
    {
        const int drainTimeoutMs = 5000;
        var token = TestContext.Current.CancellationToken;

        var listenerSupervisor = new SimpleFakeSupervisor();
        var adminHandle = new FakeAdminHandle();
        var coordinator = Build([listenerSupervisor], adminHandle, timeoutMs: drainTimeoutMs);

        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        await coordinator.ShutdownAsync(timeoutMs: drainTimeoutMs, token);
        stopwatch.Stop();

        stopwatch.ElapsedMilliseconds.ShouldBeLessThan(1000,
            "Shutdown with no active connections should complete quickly");

        listenerSupervisor.StopCalled.ShouldBeTrue("supervisor.StopAsync must be called");
        adminHandle.StopCalled.ShouldBeTrue("admin.StopAsync must be called");
    }

    /// <summary>
    /// The supervisor's stop takes ~50 ms; the flag it sets at the end must already be
    /// set when <c>ShutdownAsync</c> returns, proving the coordinator awaited it.
    /// </summary>
    [Fact]
    public async Task Shutdown_OneActiveConnection_WaitsForCompletion()
    {
        const int drainTimeoutMs = 2000;
        var token = TestContext.Current.CancellationToken;
        bool stopFinished = false;

        async Task SlowStop()
        {
            await Task.Delay(50, TestContext.Current.CancellationToken);
            stopFinished = true;
        }

        var listenerSupervisor = new DelayedStopSupervisor(SlowStop);
        var adminHandle = new FakeAdminHandle();
        var coordinator = Build([listenerSupervisor], adminHandle, timeoutMs: drainTimeoutMs);

        await coordinator.ShutdownAsync(timeoutMs: drainTimeoutMs, token);

        stopFinished.ShouldBeTrue(
            "supervisor.StopAsync must complete before ShutdownAsync returns");
        adminHandle.StopCalled.ShouldBeTrue("admin endpoint must be stopped");
    }

    /// <summary>
    /// With a short (50 ms) drain timeout the coordinator must still return promptly and
    /// stop the admin endpoint rather than blocking.
    /// </summary>
    [Fact]
    public async Task Shutdown_TimeoutExceeded_CancelsRemainingWork_AndReportsCount()
    {
        // NOTE(review): the supervisor here stops immediately and reports zero in-flight
        // work, so no deadline is actually exceeded. What is verified is that a tiny
        // timeout does not prevent prompt completion or the admin stop.
        const int drainTimeoutMs = 50;
        var token = TestContext.Current.CancellationToken;

        var listenerSupervisor = new SimpleFakeSupervisor();
        var adminHandle = new FakeAdminHandle();
        var coordinator = Build([listenerSupervisor], adminHandle, timeoutMs: drainTimeoutMs);

        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        await coordinator.ShutdownAsync(timeoutMs: drainTimeoutMs, token);
        stopwatch.Stop();

        stopwatch.ElapsedMilliseconds.ShouldBeLessThan(1000,
            "Coordinator must complete shortly after the drain timeout with zero in-flight pairs");

        adminHandle.StopCalled.ShouldBeTrue(
            "admin.StopAsync must be called after the drain phase, even when timeout fires");
    }

    /// <summary>
    /// Both fakes draw from one shared, atomically incremented counter when stopped;
    /// the supervisor's stamp must be lower than the admin endpoint's.
    /// </summary>
    [Fact]
    public async Task Shutdown_AdminEndpointStopped_AfterListenersStopped()
    {
        const int drainTimeoutMs = 500;
        var token = TestContext.Current.CancellationToken;

        int sequence = 0;
        Func<int> nextStamp = () => Interlocked.Increment(ref sequence);

        var listenerSupervisor = new SimpleFakeSupervisor(nextStamp);
        var adminHandle = new FakeAdminHandle(nextStamp);
        var coordinator = Build([listenerSupervisor], adminHandle, timeoutMs: drainTimeoutMs);

        await coordinator.ShutdownAsync(timeoutMs: drainTimeoutMs, token);

        listenerSupervisor.StopCalled.ShouldBeTrue("supervisor.StopAsync must be called");
        adminHandle.StopCalled.ShouldBeTrue("admin.StopAsync must be called");

        listenerSupervisor.StopCallOrder.ShouldBeLessThan(adminHandle.StopCallOrder,
            "Supervisor.StopAsync must be called before AdminEndpoint.StopAsync");
    }
}
|
||||
Reference in New Issue
Block a user