mbproxy: initial commit through Phase 9 (TxId multiplexing)

Adds the mbproxy service end-to-end. Phases 00-08 implement the
production-ready single-listener / 1:1-backend transparent Modbus TCP
proxy with bidirectional BCD rewriting for the ~54-PLC DL205/DL260
fleet. Phase 9 replaces the connection layer with a single backend
socket per PLC plus MBAP TxId rewriting, lifting the H2-ECOM100's
4-concurrent-client cap as an operational ceiling.

Phase 9 additions of note:
- PlcMultiplexer + UpstreamPipe + TxIdAllocator + CorrelationMap
- InFlightRequest with IReadOnlyList<InterestedParty> (load-bearing
  for Phase 10 read coalescing — do not collapse to a single field)
- Per-request watchdog: surfaces Modbus exception 0x0B to upstream
  on BackendRequestTimeoutMs, defending against lost responses,
  dead-PLC paths, and pymodbus 3.13.0's concurrent-multiplexed-
  request bug (its ServerRequestHandler.last_pdu state race)
- Status DTO + HTML gain inFlight / maxInFlight / txIdWraps /
  disconnectCascades / queueDepth (Tier 1.6 in docs/kpi.md)

Tests: 263 unit + 38 E2E. Multiplexer correctness under truly
concurrent backend traffic is proved against a stub backend in
PlcMultiplexerTests; MultiplexerE2ETests paces requests so pymodbus
3.13's single-PDU framer stays in known-good mode.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-14 01:49:35 -04:00
parent 2e937228a0
commit 56eee3c563
105 changed files with 18430 additions and 0 deletions
@@ -0,0 +1,177 @@
using Mbproxy.Diagnostics;
using Mbproxy.Options;
using Mbproxy.Proxy;
using Microsoft.Extensions.Logging.Abstractions;
using Shouldly;
using Xunit;
namespace Mbproxy.Tests.Diagnostics;
/// <summary>
/// Unit tests for <see cref="ShutdownCoordinator"/>.
/// All tests use the internal testability constructor with fake handles.
/// </summary>
[Trait("Category", "Unit")]
public sealed class ShutdownCoordinatorTests
{
// ── Fake implementations ──────────────────────────────────────────────────────────────────
/// <summary>
/// Admin-endpoint test double: remembers that <see cref="StopAsync"/> ran and, when an
/// order source was supplied, the sequence number that source returned at that moment.
/// </summary>
private sealed class FakeAdminHandle(Func<int>? orderSource = null) : IAdminEndpointHandle
{
    /// <summary>True once <see cref="StopAsync"/> has been invoked.</summary>
    public bool StopCalled { get; private set; }

    /// <summary>Value produced by the order source during stop; 0 when no source was given.</summary>
    public int StopCallOrder { get; private set; }

    /// <summary>Records the call and completes synchronously; the token is not observed.</summary>
    public Task StopAsync(CancellationToken ct)
    {
        StopCalled = true;
        StopCallOrder = orderSource is null ? 0 : orderSource();
        return Task.CompletedTask;
    }
}
/// <summary>
/// Supervisor test double whose stop completes immediately. It records the stop call,
/// the sequence number returned by the optional order source, and exposes a settable
/// <see cref="InFlightCount"/> (0 unless a test assigns it).
/// </summary>
private sealed class SimpleFakeSupervisor(Func<int>? orderSource = null) : ISupervisorHandle
{
    /// <summary>True once <see cref="StopAsync"/> has been invoked.</summary>
    public bool StopCalled { get; private set; }

    /// <summary>Value produced by the order source during stop; 0 when no source was given.</summary>
    public int StopCallOrder { get; private set; }

    /// <summary>In-flight count reported to the coordinator.</summary>
    public int InFlightCount { get; set; }

    /// <summary>Records the call and completes synchronously; the token is not observed.</summary>
    public Task StopAsync(CancellationToken ct)
    {
        StopCalled = true;
        StopCallOrder = orderSource is null ? 0 : orderSource();
        return Task.CompletedTask;
    }
}
/// <summary>
/// Supervisor test double whose stop behaviour is the caller-supplied asynchronous
/// delegate. It always reports zero in-flight requests.
/// </summary>
private sealed class DelayedStopSupervisor(Func<Task> onStop) : ISupervisorHandle
{
    /// <summary>Always 0.</summary>
    public int InFlightCount
    {
        get { return 0; }
    }

    /// <summary>Awaits the supplied delegate; the token is not observed.</summary>
    public async Task StopAsync(CancellationToken ct)
    {
        await onStop();
    }
}
// ── Helper ────────────────────────────────────────────────────────────────────────────────
/// <summary>
/// Creates a <see cref="ShutdownCoordinator"/> over the given fake handles, with a
/// null logger and options whose only explicitly set value is the graceful-shutdown timeout.
/// </summary>
/// <param name="supervisors">Supervisor handles handed to the coordinator.</param>
/// <param name="admin">Admin-endpoint handle handed to the coordinator.</param>
/// <param name="timeoutMs">Value assigned to <c>Connection.GracefulShutdownTimeoutMs</c>.</param>
/// <returns>The coordinator under test.</returns>
private static ShutdownCoordinator Build(
    IReadOnlyList<ISupervisorHandle> supervisors,
    IAdminEndpointHandle admin,
    int timeoutMs = 500)
{
    var connection = new ConnectionOptions { GracefulShutdownTimeoutMs = timeoutMs };
    var settings = new MbproxyOptions { Connection = connection };
    var wrappedSettings = Microsoft.Extensions.Options.Options.Create(settings);
    var silentLogger = NullLogger<ShutdownCoordinator>.Instance;

    return new ShutdownCoordinator(supervisors, admin, wrappedSettings, silentLogger);
}
// ── Tests ─────────────────────────────────────────────────────────────────────────────────
/// <summary>
/// Runs a shutdown with an idle fake supervisor (in-flight count 0, stop completes
/// synchronously) and a generous 5 s timeout. Expects the call to return in under a
/// second and both the supervisor and the admin handle to have been stopped.
/// </summary>
[Fact]
public async Task Shutdown_NoActiveConnections_CompletesImmediately()
{
    var cancellation = TestContext.Current.CancellationToken;
    var idleSupervisor = new SimpleFakeSupervisor();
    var adminHandle = new FakeAdminHandle();
    var coordinator = Build([idleSupervisor], adminHandle, timeoutMs: 5000);

    var timer = new System.Diagnostics.Stopwatch();
    timer.Start();
    await coordinator.ShutdownAsync(timeoutMs: 5000, cancellation);
    timer.Stop();

    long elapsedMs = timer.ElapsedMilliseconds;
    elapsedMs.ShouldBeLessThan(1000,
        "Shutdown with no active connections should complete quickly");
    idleSupervisor.StopCalled.ShouldBeTrue("supervisor.StopAsync must be called");
    adminHandle.StopCalled.ShouldBeTrue("admin.StopAsync must be called");
}
/// <summary>
/// Uses a supervisor whose stop takes 50 ms and only then sets a flag. If the flag is set
/// when <c>ShutdownAsync</c> returns, the coordinator awaited the supervisor's stop rather
/// than firing and forgetting it. The admin handle must also have been stopped.
/// </summary>
[Fact]
public async Task Shutdown_OneActiveConnection_WaitsForCompletion()
{
    var cancellation = TestContext.Current.CancellationToken;
    bool stopInvoked = false;

    // Slow stop: the flag flips only after the delay has elapsed.
    async Task SlowStop()
    {
        await Task.Delay(50, cancellation);
        stopInvoked = true;
    }

    var slowSupervisor = new DelayedStopSupervisor(SlowStop);
    var adminHandle = new FakeAdminHandle();
    var coordinator = Build([slowSupervisor], adminHandle, timeoutMs: 2000);

    await coordinator.ShutdownAsync(timeoutMs: 2000, cancellation);

    stopInvoked.ShouldBeTrue(
        "supervisor.StopAsync must complete before ShutdownAsync returns");
    adminHandle.StopCalled.ShouldBeTrue("admin endpoint must be stopped");
}
/// <summary>
/// Runs a shutdown with a very short (50 ms) drain timeout and checks that the coordinator
/// returns promptly and still stops the admin endpoint instead of blocking.
/// </summary>
/// <remarks>
/// The fake supervisor stops immediately and reports zero in-flight work, so nothing here
/// actually outlives the deadline; the test only pins the "finishes quickly and stops admin"
/// behaviour for a short timeout value.
/// </remarks>
[Fact]
public async Task Shutdown_TimeoutExceeded_CancelsRemainingWork_AndReportsCount()
{
    var cancellation = TestContext.Current.CancellationToken;
    var idleSupervisor = new SimpleFakeSupervisor();
    var adminHandle = new FakeAdminHandle();

    // Same short deadline is given to the options and to the call itself.
    var coordinator = Build([idleSupervisor], adminHandle, timeoutMs: 50);

    var timer = new System.Diagnostics.Stopwatch();
    timer.Start();
    await coordinator.ShutdownAsync(timeoutMs: 50, cancellation);
    timer.Stop();

    long elapsedMs = timer.ElapsedMilliseconds;
    elapsedMs.ShouldBeLessThan(1000,
        "Coordinator must complete shortly after the drain timeout with zero in-flight pairs");
    adminHandle.StopCalled.ShouldBeTrue(
        "admin.StopAsync must be called after the drain phase, even when timeout fires");
}
/// <summary>
/// Pins the stop ordering: both fakes draw a number from one shared atomic counter when
/// their <c>StopAsync</c> runs, and the supervisor's number must be the smaller one.
/// </summary>
[Fact]
public async Task Shutdown_AdminEndpointStopped_AfterListenersStopped()
{
    var cancellation = TestContext.Current.CancellationToken;

    // Shared ticket dispenser; each StopAsync call takes the next ticket.
    int sequence = 0;
    Func<int> takeTicket = () => Interlocked.Increment(ref sequence);

    var orderedSupervisor = new SimpleFakeSupervisor(takeTicket);
    var orderedAdmin = new FakeAdminHandle(takeTicket);
    var coordinator = Build([orderedSupervisor], orderedAdmin, timeoutMs: 500);

    await coordinator.ShutdownAsync(timeoutMs: 500, cancellation);

    orderedSupervisor.StopCalled.ShouldBeTrue("supervisor.StopAsync must be called");
    orderedAdmin.StopCalled.ShouldBeTrue("admin.StopAsync must be called");

    int supervisorTicket = orderedSupervisor.StopCallOrder;
    int adminTicket = orderedAdmin.StopCallOrder;
    supervisorTicket.ShouldBeLessThan(adminTicket,
        "Supervisor.StopAsync must be called before AdminEndpoint.StopAsync");
}
}
@@ -0,0 +1,242 @@
using System.Net;
using System.Net.Sockets;
using Mbproxy;
using Mbproxy.Proxy;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Serilog;
using Shouldly;
using Xunit;
namespace Mbproxy.Tests.Diagnostics;
/// <summary>
/// End-to-end shutdown tests for the proxy service.
///
/// Each test starts an in-process proxy host against the DL205 simulator, drives some
/// Modbus traffic through it, then signals the host to stop and verifies clean shutdown.
///
/// Tests skip gracefully when the simulator is unavailable.
/// </summary>
[Collection(nameof(Mbproxy.Tests.Sim.DL205SimulatorCollection))]
[Trait("Category", "E2E")]
public sealed class ShutdownE2ETests
{
// Simulator fixture; provides the backend Host/Port and a SkipReason when unavailable.
private readonly Mbproxy.Tests.Sim.DL205SimulatorFixture _sim;

/// <summary>Stores the simulator fixture handed to the test class.</summary>
/// <param name="sim">Simulator fixture used as the proxy's backend.</param>
public ShutdownE2ETests(Mbproxy.Tests.Sim.DL205SimulatorFixture sim) => _sim = sim;
// ── E2E 1: Clean drain during active traffic ───────────────────────────────────────────
/// <summary>
/// Starts the proxy host against the simulator, connects a raw TCP client, issues 5 FC03
/// reads one after another (each awaited before the next is sent), then stops the host
/// within a 10 s window and checks that the client's connection no longer yields a response.
/// </summary>
/// <remarks>
/// The xUnit timeout is 30 s so that the deadlines used inside the test (15 s start,
/// 10 s stop, 3 s post-stop probe) can actually be reached; a shorter test timeout would
/// abort the test before the 10 s stop window it is named after could elapse.
/// </remarks>
[Fact(Timeout = 30_000)]
public async Task E2E_StopHost_WithConnectedClient_DrainsCleanlyWithin10s()
{
    if (_sim.SkipReason is not null)
        Assert.Skip(_sim.SkipReason);

    int proxyPort = PickFreePort();
    using var host = BuildProxyHost(proxyPort);
    using var startCts = new CancellationTokenSource(TimeSpan.FromSeconds(15));
    await host.StartAsync(startCts.Token);
    await Task.Delay(200, TestContext.Current.CancellationToken); // let listener bind

    // Raw TCP socket: the test builds and parses the MBAP frames itself.
    using var socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
    socket.NoDelay = true;
    await socket.ConnectAsync("127.0.0.1", proxyPort, TestContext.Current.CancellationToken);

    // Send 5 FC03 requests sequentially and count the matching responses.
    const int count = 5;
    int successCount = 0;
    for (ushort txId = 1; txId <= count; txId++)
    {
        // FC03: read 1 register at address 0.
        byte[] req = BuildFc03Request(txId, startAddress: 0, qty: 1);
        await socket.SendAsync(req.AsMemory(), SocketFlags.None, TestContext.Current.CancellationToken);

        // Read the response header (7 bytes) then the body.
        var (success, _) = await TryReadFc03Response(socket, txId, TestContext.Current.CancellationToken);
        if (success) successCount++;
    }

    // All 5 reads must have completed before we ask the host to stop.
    successCount.ShouldBe(count, $"Expected all {count} FC03 reads to complete before stop");

    // Now stop the host within a 10 s window (the graceful-shutdown deadline).
    using var stopCts = new CancellationTokenSource(TimeSpan.FromSeconds(10));
    await host.StopAsync(stopCts.Token);

    // After host stop, the upstream socket should be closed or EOF.
    // Try to send another request; expect either 0 bytes read or a SocketException.
    bool socketClosed = false;
    try
    {
        byte[] probe = BuildFc03Request(99, startAddress: 0, qty: 1);
        await socket.SendAsync(probe.AsMemory(), SocketFlags.None, TestContext.Current.CancellationToken);
        var buf = new byte[260];
        using var readCts = new CancellationTokenSource(TimeSpan.FromSeconds(3));
        int read = await socket.ReceiveAsync(buf.AsMemory(), SocketFlags.None, readCts.Token);
        socketClosed = (read == 0); // 0 bytes = clean EOF from server
    }
    catch (SocketException)
    {
        socketClosed = true;
    }
    catch (OperationCanceledException)
    {
        // 3 s read deadline fired — no EOF, but no response either. Treat as closed enough.
        socketClosed = true;
    }

    socketClosed.ShouldBeTrue(
        "After host.StopAsync, the upstream client socket should be closed");
}
// ── E2E 2: Shutdown completes within deadline even with slow backend ───────────────────
/// <summary>
/// Configures a very short <c>GracefulShutdownTimeoutMs</c> (200 ms), proves the proxy
/// serves one FC03 read, then stops the host while the client is still connected and
/// asserts that <c>StopAsync</c> returns in under 9 s.
/// </summary>
/// <remarks>
/// No request is outstanding when stop is signalled; the single read has already been
/// answered. The xUnit timeout is 30 s so that the 10 s stop window and the 9 s elapsed-time
/// assertion are reachable — with a shorter test timeout the runner would abort the test
/// before that assertion could ever fail.
/// </remarks>
[Fact(Timeout = 30_000)]
public async Task E2E_StopHost_DuringInFlightRequest_CancelsAfterTimeout()
{
    if (_sim.SkipReason is not null)
        Assert.Skip(_sim.SkipReason);

    int proxyPort = PickFreePort();

    // Short graceful-shutdown timeout (200 ms) so the test runs quickly.
    using var host = BuildProxyHost(proxyPort, gracefulShutdownTimeoutMs: 200);
    using var startCts = new CancellationTokenSource(TimeSpan.FromSeconds(15));
    await host.StartAsync(startCts.Token);
    await Task.Delay(200, TestContext.Current.CancellationToken); // let listener bind

    // Verify the proxy is functional before stopping.
    using var socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
    socket.NoDelay = true;
    await socket.ConnectAsync("127.0.0.1", proxyPort, TestContext.Current.CancellationToken);
    byte[] req = BuildFc03Request(txId: 1, startAddress: 0, qty: 1);
    await socket.SendAsync(req.AsMemory(), SocketFlags.None, TestContext.Current.CancellationToken);
    var (preStopOk, _) = await TryReadFc03Response(socket, txId: 1, TestContext.Current.CancellationToken);
    preStopOk.ShouldBeTrue("proxy must serve traffic before stop");

    // Signal stop and measure how long the host takes to come down.
    var sw = System.Diagnostics.Stopwatch.StartNew();
    using var stopCts = new CancellationTokenSource(TimeSpan.FromSeconds(10));
    await host.StopAsync(stopCts.Token);
    sw.Stop();

    sw.ElapsedMilliseconds.ShouldBeLessThan(9000,
        "Host.StopAsync must complete within 9 s even with a short graceful timeout");
}
// ── Helpers ───────────────────────────────────────────────────────────────────────────────
/// <summary>
/// Asks the OS for an ephemeral loopback port by binding a listener to port 0, reads
/// the assigned port number and releases the listener again.
/// </summary>
/// <returns>The port number the OS assigned to the temporary listener.</returns>
/// <remarks>
/// The listener is stopped before returning, so the port is free but not reserved for
/// the caller.
/// </remarks>
private static int PickFreePort()
{
    var probe = new TcpListener(IPAddress.Loopback, 0);
    try
    {
        probe.Start();
        return ((IPEndPoint)probe.LocalEndpoint).Port;
    }
    finally
    {
        // Always release the listening socket, even if reading the endpoint throws.
        probe.Stop();
    }
}
/// <summary>
/// Builds (but does not start) an in-process proxy host with one PLC entry that listens on
/// <paramref name="proxyPort"/> and forwards to the simulator's host and port.
/// </summary>
/// <param name="proxyPort">Local port written to <c>Mbproxy:Plcs:0:ListenPort</c>.</param>
/// <param name="gracefulShutdownTimeoutMs">
/// Value written to <c>Mbproxy:Connection:GracefulShutdownTimeoutMs</c>.
/// </param>
/// <returns>The built host; the caller starts, stops and disposes it.</returns>
private IHost BuildProxyHost(int proxyPort, int gracefulShutdownTimeoutMs = 10000)
{
    // Configuration values are machine-readable, so numbers are formatted with the
    // invariant culture rather than whatever culture the test machine runs under.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    var config = new Dictionary<string, string?>
    {
        ["Mbproxy:AdminPort"] = "0", // disable admin to avoid port conflicts
        ["Mbproxy:Plcs:0:Name"] = "TestPLC",
        ["Mbproxy:Plcs:0:ListenPort"] = proxyPort.ToString(invariant),
        ["Mbproxy:Plcs:0:Host"] = _sim.Host,
        ["Mbproxy:Plcs:0:Port"] = _sim.Port.ToString(invariant),
        ["Mbproxy:Connection:BackendConnectTimeoutMs"] = "3000",
        ["Mbproxy:Connection:BackendRequestTimeoutMs"] = "3000",
        ["Mbproxy:Connection:GracefulShutdownTimeoutMs"] = gracefulShutdownTimeoutMs.ToString(invariant),
    };

    var builder = Host.CreateApplicationBuilder();
    builder.Configuration.AddInMemoryCollection(config);

    // Logger with minimum level Fatal and no sinks configured: keeps test output quiet.
    var serilogLogger = new LoggerConfiguration().MinimumLevel.Fatal().CreateLogger();
    builder.Services.AddSerilog(serilogLogger, dispose: false);

    builder.AddMbproxyOptions();
    builder.Services.AddSingleton<IPduPipeline, NoopPduPipeline>();

    // Register the worker once and expose that same instance as the hosted service.
    builder.Services.AddSingleton<ProxyWorker>();
    builder.Services.AddHostedService(sp => sp.GetRequiredService<ProxyWorker>());

    return builder.Build();
}
/// <summary>
/// Builds a 12-byte Modbus TCP frame for function code 0x03 addressed to unit 1.
/// All 16-bit fields are written high byte first.
/// </summary>
/// <param name="txId">Transaction id placed in bytes 0-1.</param>
/// <param name="startAddress">Starting register address placed in bytes 8-9.</param>
/// <param name="qty">Register quantity placed in bytes 10-11.</param>
/// <returns>The encoded request frame.</returns>
private static byte[] BuildFc03Request(ushort txId, ushort startAddress, ushort qty)
{
    static byte High(ushort value) => (byte)(value >> 8);
    static byte Low(ushort value) => (byte)(value & 0xFF);

    var frame = new byte[12];

    // MBAP header
    frame[0] = High(txId);
    frame[1] = Low(txId);
    frame[2] = 0x00; // ProtocolId (hi)
    frame[3] = 0x00; // ProtocolId (lo)
    frame[4] = 0x00; // Length (hi)
    frame[5] = 0x06; // Length (lo): UnitId + FC + 4 addr/qty bytes
    frame[6] = 0x01; // UnitId

    // PDU
    frame[7] = 0x03; // FC03
    frame[8] = High(startAddress);
    frame[9] = Low(startAddress);
    frame[10] = High(qty);
    frame[11] = Low(qty);

    return frame;
}
/// <summary>
/// Reads one Modbus TCP response frame from <paramref name="socket"/> and, when it is an
/// FC03 response, decodes its registers. The whole read is bounded by a 5 s deadline
/// linked to <paramref name="ct"/>.
/// </summary>
/// <param name="socket">Connected socket to read from.</param>
/// <param name="txId">Transaction id the response header must carry.</param>
/// <param name="ct">Caller's cancellation token.</param>
/// <returns>
/// <c>(true, registers)</c> for a well-formed FC03 response;
/// <c>(true, [])</c> when a frame with the expected transaction id arrived but carries no PDU
/// or a function code other than 0x03;
/// <c>(false, [])</c> on a transaction-id mismatch, a truncated/inconsistent frame, the peer
/// closing the connection, a timeout, or any other failure while reading.
/// </returns>
private static async Task<(bool success, ushort[] registers)> TryReadFc03Response(
    Socket socket, ushort txId, CancellationToken ct)
{
    // Fills the whole buffer, or reports false when the peer closes the connection first.
    // A receive of 0 bytes means EOF; without this check the loop would spin until the deadline.
    static async Task<bool> FillAsync(Socket source, byte[] buffer, CancellationToken token)
    {
        int filled = 0;
        while (filled < buffer.Length)
        {
            int received = await source.ReceiveAsync(buffer.AsMemory(filled), SocketFlags.None, token);
            if (received == 0)
                return false;
            filled += received;
        }
        return true;
    }

    try
    {
        using var readCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
        readCts.CancelAfter(TimeSpan.FromSeconds(5));

        // 7-byte header: TxId (2) + ProtocolId (2) + Length (2) + UnitId (1).
        byte[] header = new byte[7];
        if (!await FillAsync(socket, header, readCts.Token))
            return (false, []);

        ushort rspTxId = (ushort)((header[0] << 8) | header[1]);
        ushort length = (ushort)((header[4] << 8) | header[5]);
        int bodyLen = length - 1; // length covers UnitId + PDU body; subtract UnitId

        if (rspTxId != txId) return (false, []);
        if (bodyLen <= 0) return (true, []);

        byte[] body = new byte[bodyLen];
        if (!await FillAsync(socket, body, readCts.Token))
            return (false, []);

        // FC03 response body: FC (1) + ByteCount (1) + registers (2 each).
        // Anything else (e.g. a different function code) counts as "a response arrived".
        if (body.Length < 2 || body[0] != 0x03) return (true, []);

        int byteCount = body[1];
        // ByteCount claims more data than the frame carries: malformed, report failure.
        if (body.Length < 2 + byteCount) return (false, []);

        var regs = new ushort[byteCount / 2];
        for (int i = 0; i < regs.Length; i++)
            regs[i] = (ushort)((body[2 + i * 2] << 8) | body[3 + i * 2]);

        return (true, regs);
    }
    catch
    {
        // Deliberate best-effort: socket errors, cancellation and timeouts all mean "no response".
        return (false, []);
    }
}
}