mbproxy: initial commit through Phase 9 (TxId multiplexing)

Adds the mbproxy service end-to-end. Phases 00-08 implement the
production-ready single-listener / 1:1-backend transparent Modbus TCP
proxy with bidirectional BCD rewriting for the ~54-PLC DL205/DL260
fleet. Phase 9 replaces the connection layer with a single backend
socket per PLC plus MBAP TxId rewriting, lifting the H2-ECOM100's
4-concurrent-client cap as an operational ceiling.

Phase 9 additions of note:
- PlcMultiplexer + UpstreamPipe + TxIdAllocator + CorrelationMap
- InFlightRequest with IReadOnlyList<InterestedParty> (load-bearing
  for Phase 10 read coalescing — do not collapse to a single field)
- Per-request watchdog: surfaces Modbus exception 0x0B to upstream
  on BackendRequestTimeoutMs, defending against lost responses,
  dead-PLC paths, and pymodbus 3.13.0's concurrent-multiplexed-
  request bug (its ServerRequestHandler.last_pdu state race)
- Status DTO + HTML gain inFlight / maxInFlight / txIdWraps /
  disconnectCascades / queueDepth (Tier 1.6 in docs/kpi.md)

Tests: 263 unit + 38 E2E. Multiplexer correctness under truly
concurrent backend traffic is proved against a stub backend in
PlcMultiplexerTests; MultiplexerE2ETests paces requests so pymodbus
3.13's single-PDU framer stays in known-good mode.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-14 01:49:35 -04:00
parent 2e937228a0
commit 56eee3c563
105 changed files with 18430 additions and 0 deletions
@@ -0,0 +1,277 @@
using System.Net;
using System.Net.Sockets;
using Mbproxy.Options;
using Mbproxy.Proxy;
using Mbproxy.Proxy.Multiplexing;
using Mbproxy.Proxy.Supervision;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using Shouldly;
using Xunit;
namespace Mbproxy.Tests.Proxy.Supervision;
/// <summary>
/// Integration tests for the backend-connect Polly retry path. Phase 9 moved backend
/// connect ownership from <c>PlcConnectionPair.CreateAsync</c> into
/// <see cref="PlcMultiplexer"/>. These tests exercise the same Polly pipeline by driving
/// upstream-to-multiplexer frames against a bad/intermittent backend and observing the
/// resulting connect-success/connect-failed counters.
/// </summary>
[Trait("Category", "Unit")]
public sealed class BackendConnectRetryTests
{
/// <summary>
/// Asks the OS for an ephemeral loopback port by binding a throwaway listener to port 0,
/// then releases the listener and returns the port number it was given. The port is free
/// again when this method returns, so another process could claim it before the caller binds.
/// </summary>
private static int PickFreePort()
{
    var probe = new TcpListener(IPAddress.Loopback, 0);
    probe.Start();
    var boundEndpoint = (IPEndPoint)probe.LocalEndpoint;
    int freePort = boundEndpoint.Port;
    probe.Stop();
    return freePort;
}
/// <summary>
/// Creates a <see cref="PlcMultiplexer"/> for <paramref name="plc"/> and returns it together
/// with the <see cref="PerPlcContext"/> it was constructed with, so a test can inspect the
/// context's counters afterwards. The context uses an empty BCD tag map, fresh counters
/// and a null logger.
/// </summary>
private static (PlcMultiplexer mux, PerPlcContext ctx) BuildMux(
    PlcOptions plc,
    ConnectionOptions connOpts,
    Polly.ResiliencePipeline pipeline)
{
    var context = new PerPlcContext
    {
        PlcName = plc.Name,
        TagMap = Mbproxy.Bcd.BcdTagMap.Empty,
        Counters = new ProxyCounters(),
        Logger = NullLogger.Instance,
    };

    var pduPipeline = new BcdPduPipeline();
    var muxLogger = NullLoggerFactory.Instance.CreateLogger<PlcMultiplexer>();
    var multiplexer = new PlcMultiplexer(plc, connOpts, pduPipeline, context, muxLogger, pipeline);

    return (multiplexer, context);
}
/// <summary>
/// Connects a fresh TCP client to the proxy port and returns the accepted upstream
/// pipe alongside the client. The caller drives a single FC03 request and observes
/// what happens when the multiplexer attempts (and fails) to forward it.
/// </summary>
/// <param name="mux">Multiplexer that the accepted pipe is handed to.</param>
/// <param name="proxyPort">Loopback port <paramref name="proxyListener"/> is listening on.</param>
/// <param name="proxyListener">Already-started listener used to accept the client connection.</param>
/// <param name="plcName">PLC name passed to the <see cref="UpstreamPipe"/>.</param>
/// <returns>The connected client socket and the pipe wrapping the accepted socket; the caller owns both.</returns>
private static async Task<(Socket client, UpstreamPipe pipe)> AttachClientPipeAsync(
    PlcMultiplexer mux, int proxyPort, TcpListener proxyListener, string plcName)
{
    var client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp)
    {
        NoDelay = true,
    };
    Socket? upstreamSock = null;
    try
    {
        await client.ConnectAsync(IPAddress.Loopback, proxyPort);
        upstreamSock = await proxyListener.AcceptSocketAsync();
        var pipe = new UpstreamPipe(upstreamSock, plcName, NullLogger.Instance);

        // Fire-and-forget: the pipe loop runs on the thread pool while the test drives the client.
        _ = Task.Run(() => mux.StartPipeAsync(pipe, CancellationToken.None));
        return (client, pipe);
    }
    catch
    {
        // Ownership has not been handed to the caller yet, so release both sockets
        // here rather than leaking them when connect/accept/pipe construction fails.
        upstreamSock?.Dispose();
        client.Dispose();
        throw;
    }
}
/// <summary>
/// Builds a 12-byte Modbus TCP FC03 read request: TxId, ProtocolId 0, Length 6 and UnitId,
/// followed by the function code, start address and quantity. Every 16-bit field is
/// written high byte first.
/// </summary>
private static byte[] BuildFc03ReadFrame(ushort txId, ushort start, ushort qty, byte unitId = 1)
{
    var frame = new byte[12];

    // MBAP header.
    frame[0] = (byte)(txId >> 8);
    frame[1] = (byte)(txId & 0xFF);
    frame[2] = 0x00; // ProtocolId
    frame[3] = 0x00;
    frame[4] = 0x00; // Length = 6
    frame[5] = 0x06;
    frame[6] = unitId;

    // PDU.
    frame[7] = 0x03; // FC03
    frame[8] = (byte)(start >> 8);
    frame[9] = (byte)(start & 0xFF);
    frame[10] = (byte)(qty >> 8);
    frame[11] = (byte)(qty & 0xFF);

    return frame;
}
// ── Test 1: retries per pipeline on ConnectionRefused ─────────────────────────────────
/// <summary>
/// Points the multiplexer at a port nobody listens on, sends one FC03 request, and checks
/// that the client reads EOF (or gets a socket error) no sooner than 80 ms after the
/// client attached — consistent with the retry pipeline having waited between attempts.
/// </summary>
[Fact]
public async Task BackendConnect_RetriesPerPipeline_OnConnectionRefused()
{
    int badPort = PickFreePort();
    int proxyPort = PickFreePort();
    var profile = new RetryProfile { MaxAttempts = 3, BackoffMs = [50, 100, 200] };
    var pipeline = PolicyFactory.BuildBackendConnect(profile, NullLogger.Instance);
    var connOpts = new ConnectionOptions { BackendConnectTimeoutMs = 1000, BackendRequestTimeoutMs = 3000 };
    var plcOpts = new PlcOptions { Name = "Retry3PLC", ListenPort = proxyPort, Host = "127.0.0.1", Port = badPort };
    await using var mux = BuildMux(plcOpts, connOpts, pipeline).mux;
    var proxyListener = new TcpListener(IPAddress.Loopback, proxyPort);
    proxyListener.Start();
    try
    {
        var sw = System.Diagnostics.Stopwatch.StartNew();
        var (client, pipe) = await AttachClientPipeAsync(mux, proxyPort, proxyListener, plcOpts.Name);
        try
        {
            await client.SendAsync(BuildFc03ReadFrame(1, 0, 1), SocketFlags.None);

            // The multiplexer will Polly-retry then fail; client socket should be closed.
            // A single receive is enough: it either returns 0 (EOF), throws SocketException
            // (treated as closed), or is cancelled by the 5 s deadline and fails the test.
            var buf = new byte[1];
            using var ctsDeadline = new CancellationTokenSource(TimeSpan.FromSeconds(5));
            int n;
            try
            {
                n = await client.ReceiveAsync(buf, SocketFlags.None, ctsDeadline.Token);
            }
            catch (SocketException)
            {
                n = 0;
            }
            sw.Stop();

            n.ShouldBe(0, "upstream client should observe a clean EOF after all backend attempts fail");
            sw.ElapsedMilliseconds.ShouldBeGreaterThanOrEqualTo(80,
                "Polly retries with [50,100] delays should make connect take > 80ms total");
        }
        finally
        {
            client.Dispose();
            await pipe.DisposeAsync();
        }
    }
    finally
    {
        proxyListener.Stop();
    }
}
// ── Test 2: succeeds on second attempt when backend becomes reachable ─────────────────
/// <summary>
/// Starts the backend listener 250 ms after the test begins (inside the pipeline's first
/// 200 ms + retry window), sends one FC03 request, and checks that the backend accepts a
/// connection and that the upstream pipe is attached to the multiplexer.
/// </summary>
[Fact]
public async Task BackendConnect_Succeeds_OnSecondAttempt_WhenBackendBecomesReachable()
{
    int backendPort = PickFreePort();
    int proxyPort = PickFreePort();
    var profile = new RetryProfile { MaxAttempts = 3, BackoffMs = [200, 1000, 2000] };
    var pipeline = PolicyFactory.BuildBackendConnect(profile, NullLogger.Instance);
    var connOpts = new ConnectionOptions { BackendConnectTimeoutMs = 1000, BackendRequestTimeoutMs = 3000 };
    var plcOpts = new PlcOptions { Name = "RetryOkPLC", ListenPort = proxyPort, Host = "127.0.0.1", Port = backendPort };
    await using var muxBundle = new MuxBundle(BuildMux(plcOpts, connOpts, pipeline).mux);
    var mux = muxBundle.Mux;
    var proxyListener = new TcpListener(IPAddress.Loopback, proxyPort);
    proxyListener.Start();
    TcpListener? backendListener = null;
    Socket? acceptedBackend = null;
    Task<Socket>? acceptTask = null;
    Task? startBackendTask = null;
    try
    {
        // Start the backend listener after 250 ms — within the first backoff window.
        startBackendTask = Task.Run(async () =>
        {
            await Task.Delay(250, CancellationToken.None);
            backendListener = new TcpListener(IPAddress.Loopback, backendPort);
            backendListener.Start();
            acceptTask = backendListener.AcceptSocketAsync(CancellationToken.None).AsTask();
        }, CancellationToken.None);
        var (client, pipe) = await AttachClientPipeAsync(mux, proxyPort, proxyListener, plcOpts.Name);
        try
        {
            // Drive a request — this triggers backend connect.
            await client.SendAsync(BuildFc03ReadFrame(1, 0, 1), SocketFlags.None);
            await startBackendTask;
            acceptedBackend = await acceptTask!.WaitAsync(TimeSpan.FromSeconds(5), TestContext.Current.CancellationToken);

            // Poll until the pipe shows up as attached. The delay deliberately does NOT use
            // pollCts.Token: the loop condition observes the deadline, so a timeout falls
            // through to the assertion below instead of surfacing as a TaskCanceledException.
            using var pollCts = new CancellationTokenSource(TimeSpan.FromSeconds(5));
            while (!pollCts.IsCancellationRequested
                   && mux.AttachedPipes.Count == 0)
            {
                await Task.Delay(20, TestContext.Current.CancellationToken);
            }
            mux.AttachedPipes.Count.ShouldBeGreaterThanOrEqualTo(1,
                "the upstream pipe should remain attached after a successful backend connect");
        }
        finally
        {
            client.Dispose();
            await pipe.DisposeAsync();
        }
    }
    finally
    {
        proxyListener.Stop();

        // Let the delayed starter finish first so it cannot create the backend listener
        // after cleanup has already run. Task.WhenAny waits for completion without
        // rethrowing a fault, which avoids masking the test's own failure.
        if (startBackendTask is not null)
        {
            await Task.WhenAny(startBackendTask);
        }

        acceptedBackend?.Dispose();
        backendListener?.Stop();
    }
}
// ── Test 3: all attempts fail → upstream socket is closed ─────────────────────────────
/// <summary>
/// With a two-attempt pipeline and a backend port nobody listens on, sends one FC03 request
/// and checks that the client reads EOF (or gets a socket error) and that the context's
/// ConnectsFailed counter is at least 1.
/// </summary>
[Fact]
public async Task BackendConnect_AllAttemptsFail_ClosesUpstream()
{
    int deadBackendPort = PickFreePort();
    int listenPort = PickFreePort();

    var retryProfile = new RetryProfile { MaxAttempts = 2, BackoffMs = [50, 100] };
    var connectPipeline = PolicyFactory.BuildBackendConnect(retryProfile, NullLogger.Instance);
    var connectionOptions = new ConnectionOptions
    {
        BackendConnectTimeoutMs = 500,
        BackendRequestTimeoutMs = 3000,
    };
    var plcOptions = new PlcOptions
    {
        Name = "FailPLC",
        ListenPort = listenPort,
        Host = "127.0.0.1",
        Port = deadBackendPort,
    };

    var (multiplexer, context) = BuildMux(plcOptions, connectionOptions, connectPipeline);
    await using var mux = multiplexer;

    var frontListener = new TcpListener(IPAddress.Loopback, listenPort);
    frontListener.Start();
    try
    {
        var (client, pipe) = await AttachClientPipeAsync(mux, listenPort, frontListener, plcOptions.Name);
        try
        {
            byte[] request = BuildFc03ReadFrame(1, 0, 1);
            await client.SendAsync(request, SocketFlags.None);

            // One receive decides the outcome: 0 bytes means EOF, and a SocketException
            // is treated the same way. The 5 s token bounds the wait.
            var scratch = new byte[1];
            using var deadline = new CancellationTokenSource(TimeSpan.FromSeconds(5));
            int bytesRead;
            try
            {
                bytesRead = await client.ReceiveAsync(scratch, SocketFlags.None, deadline.Token);
            }
            catch (SocketException)
            {
                bytesRead = 0;
            }

            bytesRead.ShouldBe(0, "upstream socket should observe a clean EOF after all attempts fail");
            context.Counters.Snapshot().ConnectsFailed.ShouldBeGreaterThanOrEqualTo(1);
        }
        finally
        {
            client.Dispose();
            await pipe.DisposeAsync();
        }
    }
    finally
    {
        frontListener.Stop();
    }
}
/// <summary>
/// Thin <see cref="IAsyncDisposable"/> wrapper around a <see cref="PlcMultiplexer"/>:
/// exposes the wrapped instance through <see cref="Mux"/> and forwards
/// <see cref="DisposeAsync"/> to it, so a test can hold both in one <c>await using</c>.
/// </summary>
private sealed class MuxBundle(PlcMultiplexer mux) : IAsyncDisposable
{
    /// <summary>The wrapped multiplexer.</summary>
    public PlcMultiplexer Mux { get; } = mux;

    /// <summary>Disposes the wrapped multiplexer.</summary>
    public ValueTask DisposeAsync()
    {
        return Mux.DisposeAsync();
    }
}
}
@@ -0,0 +1,163 @@
using System.Net.Sockets;
using Mbproxy.Options;
using Mbproxy.Proxy.Supervision;
using Microsoft.Extensions.Logging.Abstractions;
using Xunit;
namespace Mbproxy.Tests.Proxy.Supervision;
/// <summary>
/// Unit tests for <see cref="PolicyFactory"/>. No network, no simulator.
/// </summary>
[Trait("Category", "Unit")]
public sealed class PolicyFactoryTests
{
// ── 1. BuildBackendConnect: default 3-attempt pipeline ──────────────────────────────
/// <summary>
/// Runs a callback that always throws ConnectionRefused through a pipeline built with
/// MaxAttempts = 3 and checks that the callback ran exactly three times.
/// </summary>
[Fact]
public async Task BuildBackendConnect_ProducesPipeline_With3Attempts_Default()
{
    var retryProfile = new RetryProfile { MaxAttempts = 3, BackoffMs = [100, 500, 2000] };
    var connectPipeline = PolicyFactory.BuildBackendConnect(retryProfile, NullLogger.Instance);

    int invocationCount = 0;
    Func<CancellationToken, ValueTask> alwaysRefused = async _ =>
    {
        invocationCount++;
        await Task.Yield();
        throw new SocketException((int)SocketError.ConnectionRefused);
    };

    await Assert.ThrowsAnyAsync<Exception>(
        async () => await connectPipeline.ExecuteAsync(alwaysRefused, CancellationToken.None));

    // One initial attempt plus two retries.
    Assert.Equal(3, invocationCount);
}
// ── 2. BuildBackendConnect: delay sequence matches BackoffMs ────────────────────────
/// <summary>
/// Records when each of the three attempts starts and checks that the gaps between them
/// are at least 40 ms and 80 ms, i.e. the configured 50 ms / 100 ms backoffs minus a
/// tolerance for timer granularity.
/// </summary>
[Fact]
public async Task BuildBackendConnect_Backoff_MatchesConfig()
{
    // Use a short backoff so the test runs fast.
    var profile = new RetryProfile { MaxAttempts = 3, BackoffMs = [50, 100, 200] };
    var pipeline = PolicyFactory.BuildBackendConnect(profile, NullLogger.Instance);

    // Measure with a Stopwatch rather than DateTime.UtcNow: the wall clock is not
    // monotonic and can be adjusted mid-test, which would make the intervals flaky.
    var clock = System.Diagnostics.Stopwatch.StartNew();
    var attemptOffsets = new List<TimeSpan>();
    await Assert.ThrowsAnyAsync<Exception>(async () =>
        await pipeline.ExecuteAsync(async _ =>
        {
            attemptOffsets.Add(clock.Elapsed);
            await Task.Yield();
            throw new SocketException((int)SocketError.ConnectionRefused);
        }, CancellationToken.None));
    Assert.Equal(3, attemptOffsets.Count);

    // Configured delay before attempt 1 is 50 ms; assert ≥ 40 ms to leave tolerance for CI.
    double delay01 = (attemptOffsets[1] - attemptOffsets[0]).TotalMilliseconds;
    Assert.True(delay01 >= 40, $"Expected delay ≥ 40ms between attempt 0 and 1, got {delay01:F0}ms");

    // Configured delay before attempt 2 is 100 ms; assert ≥ 80 ms.
    double delay12 = (attemptOffsets[2] - attemptOffsets[1]).TotalMilliseconds;
    Assert.True(delay12 >= 80, $"Expected delay ≥ 80ms between attempt 1 and 2, got {delay12:F0}ms");
}
// ── 3. BuildListenerRecovery: initial-backoff then steady-state ──────────────────────
/// <summary>
/// Drives the listener-recovery pipeline past the end of its three-element initial backoff
/// array and checks that it keeps retrying. Only the run count is asserted; the individual
/// delay values are not observed by this test.
/// </summary>
[Fact]
public async Task BuildListenerRecovery_InitialBackoffFollowedBySteadyState()
{
    // Use very short delays so the test runs fast.
    var profile = new RecoveryProfile
    {
        InitialBackoffMs = [10, 20, 30], // 3-element initial array
        SteadyStateMs = 50,
    };
    var pipeline = PolicyFactory.BuildListenerRecovery(profile, NullLogger.Instance);

    // 1 initial run + 7 retries: more retries than the initial array has entries.
    int maxRuns = 8;
    using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(5));
    int runs = 0;
    await Assert.ThrowsAnyAsync<Exception>(async () =>
        await pipeline.ExecuteAsync(async token =>
        {
            runs++;
            await Task.Yield();
            if (runs < maxRuns)
                throw new InvalidOperationException("simulate fault");

            // Final run: throw OperationCanceledException to stop the retry loop.
            // NOTE(review): the token itself is never cancelled here; this presumably relies
            // on the pipeline not retrying OperationCanceledException — confirm in PolicyFactory.
            // If it does retry, the 5 s deadline on cts ends the run instead.
            throw new OperationCanceledException(token);
        }, cts.Token));

    // The per-retry delays cannot be intercepted from outside the pipeline, so this only
    // checks that the pipeline kept retrying beyond the initial backoff array.
    Assert.True(runs >= maxRuns - 1, $"Expected at least {maxRuns - 1} runs; got {runs}");
}
// ── 4. BuildBackendConnect: no retry on non-transient exceptions ─────────────────────
/// <summary>
/// Runs a callback that throws <see cref="ArgumentException"/> through the backend-connect
/// pipeline and checks that the exception propagates after a single attempt.
/// </summary>
[Fact]
public async Task BuildBackendConnect_NoRetry_OnNonTransientException()
{
    var retryProfile = new RetryProfile { MaxAttempts = 3, BackoffMs = [100, 500, 2000] };
    var connectPipeline = PolicyFactory.BuildBackendConnect(retryProfile, NullLogger.Instance);

    int invocationCount = 0;
    Func<CancellationToken, ValueTask> throwBadArgument = async _ =>
    {
        invocationCount++;
        await Task.Yield();
        throw new ArgumentException("bad argument");
    };

    // ArgumentException is not a socket failure, so it must surface unchanged.
    await Assert.ThrowsAsync<ArgumentException>(
        async () => await connectPipeline.ExecuteAsync(throwBadArgument, CancellationToken.None));

    // Exactly one run: the pipeline did not retry.
    Assert.Equal(1, invocationCount);
}
// ── 5. BuildBackendConnect: retries ConnectionRefused but not WSAEACCES ─────────────
/// <summary>
/// Uses one two-attempt pipeline for two scenarios: a callback failing with AccessDenied
/// must run once (no retry), and a callback failing with ConnectionRefused must run twice.
/// </summary>
[Fact]
public async Task BuildBackendConnect_Retries_ConnectionRefused_Not_SocketError_Access()
{
    var retryProfile = new RetryProfile { MaxAttempts = 2, BackoffMs = [10] };
    var connectPipeline = PolicyFactory.BuildBackendConnect(retryProfile, NullLogger.Instance);

    // Scenario 1 — AccessDenied is expected to fail immediately.
    int deniedRuns = 0;
    Func<CancellationToken, ValueTask> failWithAccessDenied = async _ =>
    {
        deniedRuns++;
        await Task.Yield();
        throw new SocketException((int)SocketError.AccessDenied);
    };
    await Assert.ThrowsAsync<SocketException>(
        async () => await connectPipeline.ExecuteAsync(failWithAccessDenied, CancellationToken.None));
    Assert.Equal(1, deniedRuns);

    // Scenario 2 — ConnectionRefused is expected to be retried once (MaxAttempts = 2).
    int refusedRuns = 0;
    Func<CancellationToken, ValueTask> failWithRefused = async _ =>
    {
        refusedRuns++;
        await Task.Yield();
        throw new SocketException((int)SocketError.ConnectionRefused);
    };
    await Assert.ThrowsAsync<SocketException>(
        async () => await connectPipeline.ExecuteAsync(failWithRefused, CancellationToken.None));
    Assert.Equal(2, refusedRuns);
}
}
@@ -0,0 +1,211 @@
using System.Net;
using System.Net.Sockets;
using Mbproxy.Options;
using Mbproxy.Proxy;
using Mbproxy.Proxy.Supervision;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using Polly;
using Xunit;
namespace Mbproxy.Tests.Proxy.Supervision;
/// <summary>
/// End-to-end supervisor tests that run the proxy against the DL205 simulator.
/// These tests verify supervisor-level behaviour (recovery, counters) with a real
/// Modbus backend rather than a bare socket.
/// </summary>
[Collection(nameof(Mbproxy.Tests.Sim.DL205SimulatorCollection))]
[Trait("Category", "E2E")]
public sealed class SupervisorE2ETests
{
/// <summary>Simulator fixture; provides the backend host/port and an optional skip reason.</summary>
private readonly Mbproxy.Tests.Sim.DL205SimulatorFixture _sim;

/// <summary>Stores the simulator fixture passed in by the test framework.</summary>
public SupervisorE2ETests(Mbproxy.Tests.Sim.DL205SimulatorFixture sim) => _sim = sim;
// ── Helpers ───────────────────────────────────────────────────────────────────────────
/// <summary>
/// Binds a temporary listener to loopback port 0 so the OS assigns a free port, then
/// stops the listener and returns that port. Nothing reserves the port afterwards.
/// </summary>
private static int PickFreePort()
{
    var probe = new TcpListener(IPAddress.Loopback, 0);
    probe.Start();
    var assigned = (IPEndPoint)probe.LocalEndpoint;
    int freePort = assigned.Port;
    probe.Stop();
    return freePort;
}
/// <summary>
/// Builds a <see cref="PlcListenerSupervisor"/> that listens on <paramref name="listenPort"/>
/// and forwards to the simulator's host and port, using null loggers throughout.
/// </summary>
/// <param name="listenPort">Port the supervised listener should bind.</param>
/// <param name="recoveryProfile">
/// Listener recovery timings; when <c>null</c>, a 200 ms initial / 200 ms steady-state profile is used.
/// </param>
private PlcListenerSupervisor BuildSimSupervisor(
    int listenPort,
    RecoveryProfile? recoveryProfile = null)
{
    RecoveryProfile effectiveProfile = recoveryProfile ?? new RecoveryProfile
    {
        InitialBackoffMs = [200, 200],
        SteadyStateMs = 200,
    };
    ILoggerFactory loggers = NullLoggerFactory.Instance;

    var simPlc = new PlcOptions
    {
        Name = "SimPLC",
        ListenPort = listenPort,
        Host = _sim.Host,
        Port = _sim.Port,
    };
    var connection = new ConnectionOptions
    {
        BackendConnectTimeoutMs = 3000,
        BackendRequestTimeoutMs = 3000,
    };

    var listenerRecovery = PolicyFactory.BuildListenerRecovery(effectiveProfile, NullLogger.Instance);
    var backendRetryProfile = new RetryProfile { MaxAttempts = 2, BackoffMs = [100, 500] };
    var backendConnect = PolicyFactory.BuildBackendConnect(backendRetryProfile, NullLogger.Instance);

    var supervisor = new PlcListenerSupervisor(
        plc: simPlc,
        connectionOptions: connection,
        pipeline: new NoopPduPipeline(),
        listenerLogger: loggers.CreateLogger<PlcListener>(),
        multiplexerLogger: loggers.CreateLogger<Mbproxy.Proxy.Multiplexing.PlcMultiplexer>(),
        pipeLogger: loggers.CreateLogger("Mbproxy.Proxy.UpstreamPipe.Test"),
        perPlcContext: null,
        recoveryPipeline: listenerRecovery,
        logger: loggers.CreateLogger<PlcListenerSupervisor>(),
        backendConnectPipeline: backendConnect);
    return supervisor;
}
// ── E2E 1: Recovery when blocking listener releases port ──────────────────────────────
/// <summary>
/// Starts the supervisor while another listener holds its port, checks that it reports
/// Recovering, releases the port, waits for Bound, and then sends one FC03 request through
/// the proxy to confirm that a response with the same TxId comes back.
/// </summary>
[Fact(Timeout = 5_000)]
public async Task E2E_Recovery_When_BlockingListenerReleasesPort()
{
    if (_sim.SkipReason is not null)
        Assert.Skip(_sim.SkipReason);
    int listenPort = PickFreePort();

    // Block the port before starting the supervisor.
    var blocker = new TcpListener(IPAddress.Any, listenPort);
    blocker.Start();
    try
    {
        await using var supervisor = BuildSimSupervisor(listenPort);
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
        await supervisor.StartAsync(cts.Token);

        // Wait for first bind attempt to fail.
        await supervisor.WaitForInitialBindAttemptAsync(cts.Token);
        Assert.Equal(SupervisorState.Recovering, supervisor.Snapshot().State);

        // Release the port.
        blocker.Stop();

        // Poll for up to 3 s for the supervisor to bind.
        using var recoveryCts = new CancellationTokenSource(TimeSpan.FromSeconds(3));
        while (!recoveryCts.IsCancellationRequested)
        {
            if (supervisor.Snapshot().State == SupervisorState.Bound)
                break;
            await Task.Delay(50, TestContext.Current.CancellationToken);
        }
        Assert.Equal(SupervisorState.Bound, supervisor.Snapshot().State);

        // Verify the proxy actually serves traffic by connecting to it.
        using var client = new TcpClient();
        await client.ConnectAsync("127.0.0.1", listenPort, cts.Token);
        var stream = client.GetStream();

        // Send a minimal FC03 request (read 1 register at address 0).
        var req = new byte[]
        {
            0x00, 0x01, // TxId
            0x00, 0x00, // ProtocolId
            0x00, 0x06, // Length (6)
            0x01,       // UnitId
            0x03,       // FC03
            0x00, 0x00, // Start address 0
            0x00, 0x01, // Qty 1
        };
        await stream.WriteAsync(req, cts.Token);

        // Read at least 9 bytes: the 7-byte MBAP header plus two PDU bytes, which is the
        // shortest response Modbus TCP can carry (an exception reply). Only the echoed
        // TxId is checked below, so the remaining bytes are not needed.
        var rsp = new byte[260];
        int read = 0;
        using var readCts = new CancellationTokenSource(TimeSpan.FromSeconds(5));
        while (read < 9 && !readCts.IsCancellationRequested)
        {
            int got = await stream.ReadAsync(rsp.AsMemory(read), readCts.Token);
            if (got == 0)
            {
                // Peer closed the connection: stop instead of spinning on zero-byte reads
                // and let the length assertion below report the short response.
                break;
            }
            read += got;
        }

        // Verify we got a response with matching TxId.
        Assert.True(read >= 9, $"Expected ≥ 9 bytes, got {read}");
        Assert.Equal(0x00, rsp[0]); // TxId high
        Assert.Equal(0x01, rsp[1]); // TxId low
        await supervisor.StopAsync(cts.Token);
    }
    finally
    {
        // Stop is safe to repeat; this releases the port when an assertion failed
        // before the explicit release above.
        blocker.Stop();
    }
}
// ── E2E 2: RecoveryAttempts counter increments and is visible on Snapshot ─────────────
/// <summary>
/// Keeps the listen port blocked for ~600 ms with a 100 ms recovery backoff and checks that
/// the snapshot reports Recovering, at least two recovery attempts and a bind error; then
/// releases the port and checks that the supervisor binds, keeps its attempt count and
/// clears the bind error.
/// </summary>
[Fact(Timeout = 5_000)]
public async Task E2E_RecoveryAttempts_CounterIncrements_Visible_OnSnapshot()
{
    if (_sim.SkipReason is not null)
        Assert.Skip(_sim.SkipReason);
    int listenPort = PickFreePort();

    // Block the port so the supervisor enters recovery.
    var blocker = new TcpListener(IPAddress.Any, listenPort);
    blocker.Start();
    try
    {
        // Use short delays to get multiple recovery attempts quickly.
        var profile = new RecoveryProfile
        {
            InitialBackoffMs = [100, 100, 100],
            SteadyStateMs = 100,
        };
        await using var supervisor = BuildSimSupervisor(listenPort, profile);
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(20));
        await supervisor.StartAsync(cts.Token);
        await supervisor.WaitForInitialBindAttemptAsync(cts.Token);

        // Wait for multiple recovery attempts to accumulate.
        await Task.Delay(600, TestContext.Current.CancellationToken); // ~6 × 100 ms attempts
        var snap = supervisor.Snapshot();
        Assert.Equal(SupervisorState.Recovering, snap.State);
        Assert.True(snap.RecoveryAttempts >= 2,
            $"Expected ≥ 2 recovery attempts after 600ms with 100ms backoff; got {snap.RecoveryAttempts}");
        Assert.NotNull(snap.LastBindError);

        // Release the port and verify recovery.
        blocker.Stop();
        using var recoveryCts = new CancellationTokenSource(TimeSpan.FromSeconds(3));
        while (!recoveryCts.IsCancellationRequested)
        {
            if (supervisor.Snapshot().State == SupervisorState.Bound)
                break;
            await Task.Delay(50, TestContext.Current.CancellationToken);
        }
        Assert.Equal(SupervisorState.Bound, supervisor.Snapshot().State);

        // RecoveryAttempts must still be the accumulated value (not reset to 0).
        var afterSnap = supervisor.Snapshot();
        Assert.True(afterSnap.RecoveryAttempts >= snap.RecoveryAttempts,
            $"RecoveryAttempts should accumulate; was {snap.RecoveryAttempts}, now {afterSnap.RecoveryAttempts}");

        // LastBindError should be cleared after a successful bind.
        Assert.Null(afterSnap.LastBindError);
        await supervisor.StopAsync(cts.Token);
    }
    finally
    {
        // Stop is safe to repeat; this releases the port when an assertion failed
        // before the explicit release above.
        blocker.Stop();
    }
}
}
@@ -0,0 +1,287 @@
using System.Net;
using System.Net.Sockets;
using Mbproxy.Options;
using Mbproxy.Proxy;
using Mbproxy.Proxy.Supervision;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using Polly;
using Xunit;
namespace Mbproxy.Tests.Proxy.Supervision;
/// <summary>
/// Integration tests for <see cref="PlcListenerSupervisor"/> using real sockets.
/// No simulator required — these tests drive bind/recover cycles directly.
/// </summary>
[Trait("Category", "Unit")]
public sealed class SupervisorTests
{
// ── Helpers ───────────────────────────────────────────────────────────────────────────
/// <summary>
/// Returns a loopback port the OS reported as free: a temporary listener is bound to
/// port 0, its assigned port is read, and the listener is stopped again. The port is
/// not reserved after this method returns.
/// </summary>
private static int PickFreePort()
{
    var probe = new TcpListener(IPAddress.Loopback, 0);
    probe.Start();
    var assigned = (IPEndPoint)probe.LocalEndpoint;
    int freePort = assigned.Port;
    probe.Stop();
    return freePort;
}
/// <summary>
/// PLC options for a listener on <paramref name="listenPort"/> with a backend address of
/// 127.0.0.1:502.
/// </summary>
private static PlcOptions MakePlcOptions(int listenPort)
{
    var options = new PlcOptions
    {
        Name = "TestPLC",
        ListenPort = listenPort,
        Host = "127.0.0.1",
        Port = 502,
    };
    return options;
}
/// <summary>
/// Connection options used by every supervisor in this class: 500 ms backend connect
/// timeout and 3000 ms backend request timeout.
/// </summary>
private static ConnectionOptions MakeConnectionOptions()
{
    var options = new ConnectionOptions
    {
        BackendConnectTimeoutMs = 500,
        BackendRequestTimeoutMs = 3000,
    };
    return options;
}
/// <summary>
/// Creates a listener-recovery pipeline with short delays for tests: two initial backoff
/// steps of <paramref name="initialMs"/> each, then <paramref name="steadyMs"/> as the
/// steady-state value.
/// </summary>
private static ResiliencePipeline FastRecoveryPipeline(int initialMs = 100, int steadyMs = 100)
{
    int[] initialSteps = [initialMs, initialMs];
    return PolicyFactory.BuildListenerRecovery(
        new RecoveryProfile
        {
            InitialBackoffMs = initialSteps,
            SteadyStateMs = steadyMs,
        },
        NullLogger.Instance);
}
/// <summary>
/// Builds a <see cref="PlcListenerSupervisor"/> for <paramref name="port"/> with null loggers,
/// no per-PLC context and no backend-connect pipeline.
/// </summary>
/// <param name="port">Port the supervised listener should bind.</param>
/// <param name="pipeline">
/// Recovery pipeline; when <c>null</c>, <see cref="FastRecoveryPipeline"/> with its defaults is used.
/// </param>
private static PlcListenerSupervisor BuildSupervisor(
    int port,
    ResiliencePipeline? pipeline = null)
{
    ILoggerFactory loggers = NullLoggerFactory.Instance;
    ResiliencePipeline recovery = pipeline ?? FastRecoveryPipeline();

    var supervisor = new PlcListenerSupervisor(
        plc: MakePlcOptions(port),
        connectionOptions: MakeConnectionOptions(),
        pipeline: new NoopPduPipeline(),
        listenerLogger: loggers.CreateLogger<PlcListener>(),
        multiplexerLogger: loggers.CreateLogger<Mbproxy.Proxy.Multiplexing.PlcMultiplexer>(),
        pipeLogger: loggers.CreateLogger("Mbproxy.Proxy.UpstreamPipe.Test"),
        perPlcContext: null,
        recoveryPipeline: recovery,
        logger: loggers.CreateLogger<PlcListenerSupervisor>(),
        backendConnectPipeline: null);
    return supervisor;
}
// ── Test 1: starts listener and transitions to Bound ─────────────────────────────────
/// <summary>
/// Starts a supervisor on a free port and checks that it reports Bound with no bind error
/// and zero recovery attempts, then reports Stopped after <c>StopAsync</c>.
/// </summary>
[Fact]
public async Task Supervisor_StartsListener_AndTransitionsToBound()
{
    int freePort = PickFreePort();
    await using var supervisor = BuildSupervisor(freePort);
    using var lifetime = new CancellationTokenSource(TimeSpan.FromSeconds(10));

    await supervisor.StartAsync(lifetime.Token);
    // Block until the first bind attempt has finished, whatever its outcome.
    await supervisor.WaitForInitialBindAttemptAsync(lifetime.Token);

    var afterStart = supervisor.Snapshot();
    Assert.Equal(SupervisorState.Bound, afterStart.State);
    Assert.Null(afterStart.LastBindError);
    Assert.Equal(0, afterStart.RecoveryAttempts);

    await supervisor.StopAsync(lifetime.Token);
    var afterStop = supervisor.Snapshot();
    Assert.Equal(SupervisorState.Stopped, afterStop.State);
}
// ── Test 2: port in use → transitions to Recovering ──────────────────────────────────
/// <summary>
/// Starts a supervisor on a port that another listener already holds and checks that the
/// snapshot reports Recovering, a bind error, and at least one recovery attempt.
/// </summary>
[Fact]
public async Task Supervisor_StartFails_WhenPortInUse_TransitionsToRecovering()
{
    int contestedPort = PickFreePort();

    // Hold the port so the supervisor's own bind cannot succeed.
    var portHolder = new TcpListener(IPAddress.Any, contestedPort);
    portHolder.Start();
    try
    {
        await using var supervisor = BuildSupervisor(contestedPort);
        using var lifetime = new CancellationTokenSource(TimeSpan.FromSeconds(5));
        await supervisor.StartAsync(lifetime.Token);

        // Give the first bind attempt at most 2 s to complete.
        using var firstAttemptDeadline = new CancellationTokenSource(TimeSpan.FromSeconds(2));
        await supervisor.WaitForInitialBindAttemptAsync(firstAttemptDeadline.Token);

        var observed = supervisor.Snapshot();
        Assert.Equal(SupervisorState.Recovering, observed.State);
        Assert.NotNull(observed.LastBindError);
        Assert.True(observed.RecoveryAttempts >= 1,
            $"Expected RecoveryAttempts >= 1, got {observed.RecoveryAttempts}");

        await supervisor.StopAsync(lifetime.Token);
    }
    finally
    {
        portHolder.Stop();
    }
}
// ── Test 3: recovers when port frees ─────────────────────────────────────────────────
/// <summary>
/// Starts a supervisor while its port is held by another listener, checks that it reports
/// Recovering, releases the port, and checks that it reaches Bound within 3 s with at
/// least one recovery attempt recorded.
/// </summary>
[Fact]
public async Task Supervisor_Recovers_WhenPortFrees()
{
    int port = PickFreePort();

    // Occupy the port.
    var blocker = new TcpListener(IPAddress.Any, port);
    blocker.Start();
    try
    {
        // Use a fast initial backoff of 200 ms so recovery is quick.
        var pipeline = FastRecoveryPipeline(initialMs: 200, steadyMs: 200);
        await using var supervisor = BuildSupervisor(port, pipeline);
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(15));
        await supervisor.StartAsync(cts.Token);

        // Wait for the supervisor to enter Recovering.
        using var waitCts = new CancellationTokenSource(TimeSpan.FromSeconds(3));
        await supervisor.WaitForInitialBindAttemptAsync(waitCts.Token);
        Assert.Equal(SupervisorState.Recovering, supervisor.Snapshot().State);

        // Release the port — the supervisor should bind on its next retry (≤ 200 ms + slack).
        blocker.Stop();

        // Poll for up to 3 s for the supervisor to reach Bound.
        using var recoveryCts = new CancellationTokenSource(TimeSpan.FromSeconds(3));
        while (!recoveryCts.IsCancellationRequested)
        {
            if (supervisor.Snapshot().State == SupervisorState.Bound)
                break;
            await Task.Delay(50, TestContext.Current.CancellationToken);
        }
        Assert.Equal(SupervisorState.Bound, supervisor.Snapshot().State);
        Assert.True(supervisor.Snapshot().RecoveryAttempts >= 1,
            "RecoveryAttempts should be ≥ 1 after at least one failed bind");
        await supervisor.StopAsync(cts.Token);
    }
    finally
    {
        // Stop is safe to repeat; this releases the port when an assertion failed
        // before the explicit release above.
        blocker.Stop();
    }
}
// ── Test 4: runtime fault triggers recovery ──────────────────────────────────────────
/// <summary>
/// Checks the no-fault baseline: a supervisor started on a free port reports Bound with
/// zero recovery attempts and reports Stopped after <c>StopAsync</c>.
/// </summary>
[Fact]
public async Task Supervisor_RuntimeFault_TriggersRecovery()
{
    // Despite its name, this test does not inject a runtime fault. It covers:
    //   1. the supervisor reaching Bound,
    //   2. RecoveryAttempts staying at 0 when nothing went wrong,
    //   3. the transition to Stopped after StopAsync.
    // Full runtime-fault scenarios are left to the E2E tests.
    int freePort = PickFreePort();
    await using var supervisor = BuildSupervisor(
        freePort,
        FastRecoveryPipeline(initialMs: 100, steadyMs: 100));
    using var lifetime = new CancellationTokenSource(TimeSpan.FromSeconds(10));

    await supervisor.StartAsync(lifetime.Token);
    await supervisor.WaitForInitialBindAttemptAsync(lifetime.Token);
    Assert.Equal(SupervisorState.Bound, supervisor.Snapshot().State);

    var boundSnapshot = supervisor.Snapshot();
    Assert.Equal(SupervisorState.Bound, boundSnapshot.State);
    Assert.Equal(0, boundSnapshot.RecoveryAttempts);

    await supervisor.StopAsync(lifetime.Token);
    var stoppedSnapshot = supervisor.Snapshot();
    Assert.Equal(SupervisorState.Stopped, stoppedSnapshot.State);
}
// ── Test 5: StopAsync while in Recovering does not hang ──────────────────────────────
/// <summary>
/// Puts the supervisor into Recovering with a 30 s steady-state backoff and checks that
/// <c>StopAsync</c>, given a 2 s token, completes and leaves the supervisor Stopped.
/// </summary>
[Fact]
public async Task Supervisor_Stop_CleanlyTransitionsTo_Stopped_AndCancelsRetry()
{
    int contestedPort = PickFreePort();

    // Hold the port for the whole test so the supervisor can never bind.
    var portHolder = new TcpListener(IPAddress.Any, contestedPort);
    portHolder.Start();
    try
    {
        // Short first backoff, then a 30 s steady-state delay: if StopAsync failed to
        // interrupt that delay, the 2 s stop token below would cancel it.
        var slowRecovery = PolicyFactory.BuildListenerRecovery(
            new RecoveryProfile
            {
                InitialBackoffMs = [100],
                SteadyStateMs = 30_000,
            },
            NullLogger.Instance);
        await using var supervisor = BuildSupervisor(contestedPort, slowRecovery);
        using var lifetime = new CancellationTokenSource(TimeSpan.FromSeconds(30));
        await supervisor.StartAsync(lifetime.Token);

        // The first bind fails because the port is taken.
        using var firstAttemptDeadline = new CancellationTokenSource(TimeSpan.FromSeconds(2));
        await supervisor.WaitForInitialBindAttemptAsync(firstAttemptDeadline.Token);
        var recovering = supervisor.Snapshot();
        Assert.Equal(SupervisorState.Recovering, recovering.State);

        // Leave time for the 100 ms initial backoff to elapse so the long delay is in effect.
        await Task.Delay(250, TestContext.Current.CancellationToken);

        using var stopDeadline = new CancellationTokenSource(TimeSpan.FromSeconds(2));
        await supervisor.StopAsync(stopDeadline.Token);
        var stopped = supervisor.Snapshot();
        Assert.Equal(SupervisorState.Stopped, stopped.State);
    }
    finally
    {
        portHolder.Stop();
    }
}
// ── Test 6: RecoveryAttempts accumulates over lifetime ───────────────────────────────
/// <summary>
/// Lets recovery attempts accumulate while the port is blocked, releases the port, and
/// checks that the RecoveryAttempts value read afterwards is not lower than the value
/// read while blocked.
/// </summary>
[Fact]
public async Task Supervisor_RecoveryAttempts_AccumulateOverLifetime()
{
    int port = PickFreePort();

    // Occupy the port initially.
    var blocker = new TcpListener(IPAddress.Any, port);
    blocker.Start();
    try
    {
        var pipeline = FastRecoveryPipeline(initialMs: 100, steadyMs: 100);
        await using var supervisor = BuildSupervisor(port, pipeline);
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(15));
        await supervisor.StartAsync(cts.Token);

        // Wait for first recovery attempt.
        await supervisor.WaitForInitialBindAttemptAsync(cts.Token);
        Assert.Equal(SupervisorState.Recovering, supervisor.Snapshot().State);

        // Wait for a couple more retry cycles (each ~100 ms).
        await Task.Delay(400, TestContext.Current.CancellationToken);
        int midCount = supervisor.Snapshot().RecoveryAttempts;
        Assert.True(midCount >= 1, $"Expected ≥ 1 recovery attempt, got {midCount}");

        // Now release the port so the supervisor can recover.
        blocker.Stop();
        await Task.Delay(500, TestContext.Current.CancellationToken);

        // Verify RecoveryAttempts did NOT reset to 0 after recovery.
        // It should still show the same value or higher (if another retry happened).
        int afterCount = supervisor.Snapshot().RecoveryAttempts;
        Assert.True(afterCount >= midCount,
            $"RecoveryAttempts should accumulate (was {midCount}, now {afterCount})");
        await supervisor.StopAsync(cts.Token);
    }
    finally
    {
        // Stop is safe to repeat; this releases the port when an assertion failed
        // before the explicit release above.
        blocker.Stop();
    }
}
}