mbproxy: initial commit through Phase 9 (TxId multiplexing)
Adds the mbproxy service end-to-end. Phases 00-08 implement the production-ready single-listener / 1:1-backend transparent Modbus TCP proxy with bidirectional BCD rewriting for the ~54-PLC DL205/DL260 fleet. Phase 9 replaces the connection layer with a single backend socket per PLC plus MBAP TxId rewriting, lifting the H2-ECOM100's 4-concurrent-client cap as an operational ceiling.

Phase 9 additions of note:
- PlcMultiplexer + UpstreamPipe + TxIdAllocator + CorrelationMap
- InFlightRequest with IReadOnlyList<InterestedParty> (load-bearing for Phase 10 read coalescing — do not collapse to a single field)
- Per-request watchdog: surfaces Modbus exception 0x0B to upstream on BackendRequestTimeoutMs, defending against lost responses, dead-PLC paths, and pymodbus 3.13.0's concurrent-multiplexed-request bug (its ServerRequestHandler.last_pdu state race)
- Status DTO + HTML gain inFlight / maxInFlight / txIdWraps / disconnectCascades / queueDepth (Tier 1.6 in docs/kpi.md)

Tests: 263 unit + 38 E2E. Multiplexer correctness under truly concurrent backend traffic is proved against a stub backend in PlcMultiplexerTests; MultiplexerE2ETests paces requests so pymodbus 3.13's single-PDU framer stays in known-good mode.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,664 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Diagnostics;
|
||||
using System.Net.Sockets;
|
||||
using System.Threading.Channels;
|
||||
using Mbproxy.Options;
|
||||
using Polly;
|
||||
|
||||
namespace Mbproxy.Proxy.Multiplexing;
|
||||
|
||||
/// <summary>
|
||||
/// Owner of the single backend TCP connection to one PLC. Multiplexes many
|
||||
/// <see cref="UpstreamPipe"/> instances onto that one socket by rewriting MBAP transaction
|
||||
/// IDs so concurrent in-flight requests from different upstream clients remain
|
||||
/// distinguishable on the shared wire. The multiplexer:
|
||||
///
|
||||
/// <list type="bullet">
|
||||
/// <item><description>Opens and re-opens the backend socket through a Polly retry pipeline
|
||||
/// that matches the <see cref="ResilienceOptions.BackendConnect"/> profile.</description></item>
|
||||
/// <item><description>Runs one backend writer task that drains <see cref="_outboundChannel"/>
|
||||
/// into the backend socket (single writer; no socket-level synchronisation needed).</description></item>
|
||||
/// <item><description>Runs one backend reader task that decodes MBAP frames from the backend,
|
||||
/// looks each frame up in the <see cref="CorrelationMap"/>, restores each interested
|
||||
/// party's original TxId, and hands the frame to that party's
|
||||
/// <see cref="UpstreamPipe._responseChannel"/>.</description></item>
|
||||
/// <item><description>Cascades a backend disconnect by closing every attached pipe and
|
||||
/// freeing every allocated proxy TxId, then waits for the next upstream request to
|
||||
/// arrive (which triggers a fresh backend connect via Polly).</description></item>
|
||||
/// </list>
|
||||
///
|
||||
/// <para><b>Threading invariants:</b> a single backend writer touches the backend socket
|
||||
/// for sends; a single backend reader touches the same socket for receives. Per-upstream
|
||||
/// read tasks call <see cref="OnUpstreamFrameAsync"/>, which allocates a proxy TxId, queues
|
||||
/// the request frame into <see cref="_outboundChannel"/>, and returns. Upstream-side writes
|
||||
/// flow through each pipe's response channel — never directly through this class.</para>
|
||||
///
|
||||
/// <para><b>Lifecycle:</b> the multiplexer is created with the backend offline. The first
|
||||
/// <see cref="OnUpstreamFrameAsync"/> call (or the first <see cref="Attach"/> if you prefer
|
||||
/// eager-start) triggers backend connect through the Polly pipeline. Subsequent in-flight
|
||||
/// requests reuse the same socket. <see cref="DisposeAsync"/> tears down the backend
|
||||
/// socket, the writer/reader tasks, and every attached pipe.</para>
|
||||
/// </summary>
|
||||
internal sealed class PlcMultiplexer : IAsyncDisposable, IMultiplexCountersProvider
|
||||
{
|
||||
private const int OutboundChannelCapacity = 256;
|
||||
|
||||
private readonly PlcOptions _plc;
|
||||
private readonly ConnectionOptions _connectionOptions;
|
||||
private readonly IPduPipeline _pipeline;
|
||||
private readonly PerPlcContext _ctx;
|
||||
private readonly ILogger<PlcMultiplexer> _logger;
|
||||
private readonly ResiliencePipeline? _backendConnectPipeline;
|
||||
|
||||
private readonly TxIdAllocator _allocator = new();
|
||||
private readonly CorrelationMap _correlation = new();
|
||||
|
||||
private readonly Channel<byte[]> _outboundChannel = Channel.CreateBounded<byte[]>(
|
||||
new BoundedChannelOptions(OutboundChannelCapacity)
|
||||
{
|
||||
FullMode = BoundedChannelFullMode.Wait,
|
||||
SingleReader = true,
|
||||
SingleWriter = false,
|
||||
});
|
||||
|
||||
// Attached pipes — Phase 9 needs the list for the status page; Phase 10 will need it for
|
||||
// coalescing (fan-out). ConcurrentDictionary keyed on UpstreamPipe.Id for O(1) detach.
|
||||
private readonly ConcurrentDictionary<Guid, UpstreamPipe> _pipes = new();
|
||||
|
||||
// Lifecycle plumbing. Backend tasks share a CTS; cascading disconnect cancels it,
|
||||
// which terminates both the writer and reader tasks. The next call to
|
||||
// EnsureBackendConnectedAsync constructs a fresh CTS and a fresh backend socket.
|
||||
private readonly object _backendLock = new();
|
||||
private Socket? _backendSocket;
|
||||
private CancellationTokenSource? _backendCts;
|
||||
private Task? _backendWriterTask;
|
||||
private Task? _backendReaderTask;
|
||||
|
||||
private readonly CancellationTokenSource _disposeCts = new();
|
||||
private bool _disposed;
|
||||
private Task? _watchdogTask;
|
||||
|
||||
/// <summary>
/// Creates a multiplexer for one PLC. No backend socket is opened here; the constructor
/// only stores its dependencies, registers itself as the counters' multiplex provider,
/// and starts the per-request timeout watchdog.
/// </summary>
/// <param name="plc">Options of the PLC this multiplexer serves (name, host, port).</param>
/// <param name="connectionOptions">Connect and request timeout settings.</param>
/// <param name="pipeline">PDU pipeline applied to every request and response frame.</param>
/// <param name="perPlcContext">Per-PLC context that carries the counters.</param>
/// <param name="logger">Logger for multiplexer events.</param>
/// <param name="backendConnectPipeline">
/// Optional Polly pipeline wrapped around backend connect attempts; when <c>null</c> a
/// single attempt is made.
/// </param>
/// <exception cref="ArgumentNullException">A required dependency is <c>null</c>.</exception>
public PlcMultiplexer(
    PlcOptions plc,
    ConnectionOptions connectionOptions,
    IPduPipeline pipeline,
    PerPlcContext perPlcContext,
    ILogger<PlcMultiplexer> logger,
    ResiliencePipeline? backendConnectPipeline = null)
{
    // Fail fast: a null dependency would otherwise only surface later, partly on the
    // background watchdog task where the failure is easy to miss.
    ArgumentNullException.ThrowIfNull(plc);
    ArgumentNullException.ThrowIfNull(connectionOptions);
    ArgumentNullException.ThrowIfNull(pipeline);
    ArgumentNullException.ThrowIfNull(perPlcContext);
    ArgumentNullException.ThrowIfNull(logger);

    _plc = plc;
    _connectionOptions = connectionOptions;
    _pipeline = pipeline;
    _ctx = perPlcContext;
    _logger = logger;
    _backendConnectPipeline = backendConnectPipeline;

    // Register this multiplexer as the live telemetry source for the PLC's counters.
    _ctx.Counters.SetMultiplexProvider(this);

    // Spin up the per-request timeout watchdog. It scans the correlation map at a fixed
    // interval and times out any in-flight request older than BackendRequestTimeoutMs.
    // Critical for: lost responses, dead-PLC paths, and backends that mis-echo TxIds
    // (e.g. pymodbus 3.13.0's concurrent-multiplexed-request bug — see test files).
    _watchdogTask = Task.Run(() => RunRequestTimeoutWatchdogAsync(_disposeCts.Token), CancellationToken.None);
}
|
||||
|
||||
// ── IMultiplexCountersProvider ────────────────────────────────────────────
|
||||
|
||||
/// <summary>Current in-flight count as reported by the TxId allocator.</summary>
public long InFlightCount => _allocator.InFlightCount;
|
||||
/// <summary>Wrap count as reported by the TxId allocator.</summary>
public long TxIdWraps => _allocator.WrapCount;
|
||||
/// <summary>Number of frames currently waiting in the outbound channel.</summary>
public long BackendQueueDepth => _outboundChannel.Reader.Count;
|
||||
|
||||
// ── Public surface ────────────────────────────────────────────────────────
|
||||
|
||||
/// <summary>
|
||||
/// Read-only collection of currently-attached upstream pipes. Used by the status page.
|
||||
/// </summary>
|
||||
/// <remarks>Every access copies the dictionary's current values into a new array.</remarks>
public IReadOnlyCollection<UpstreamPipe> AttachedPipes => _pipes.Values.ToArray();
|
||||
|
||||
/// <summary>
/// Attaches an upstream pipe to this multiplexer by registering it under its
/// <c>Id</c>. The caller is responsible for running the pipe's read+write loops
/// (typically via <see cref="StartPipeAsync"/>), which wires the pipe's OnFrame
/// callback back into <see cref="OnUpstreamFrameAsync"/>.
/// </summary>
/// <param name="pipe">The upstream pipe to register.</param>
/// <exception cref="ObjectDisposedException">The multiplexer has been disposed.</exception>
/// <exception cref="ArgumentNullException"><paramref name="pipe"/> is <c>null</c>.</exception>
public void Attach(UpstreamPipe pipe)
{
    // Disposed check stays first so its precedence over argument errors is unchanged.
    if (_disposed)
    {
        throw new ObjectDisposedException(nameof(PlcMultiplexer));
    }

    // Without this guard a null pipe surfaces as a NullReferenceException on pipe.Id.
    ArgumentNullException.ThrowIfNull(pipe);

    _pipes[pipe.Id] = pipe;
}
|
||||
|
||||
/// <summary>
/// Attaches <paramref name="pipe"/>, launches its write loop in the background and starts
/// its read loop with a callback that forwards every upstream frame to
/// <see cref="OnUpstreamFrameAsync"/>.
/// </summary>
/// <param name="pipe">The upstream pipe to attach and run.</param>
/// <param name="ct">Token handed to both the write loop and the read loop.</param>
/// <returns>
/// The read-loop task. A continuation on it removes the pipe from the attached set; the
/// pipe itself is not disposed here.
/// </returns>
public Task StartPipeAsync(UpstreamPipe pipe, CancellationToken ct)
{
    Attach(pipe);

    // Fire-and-forget: the write loop ends when the pipe is disposed or its channel
    // completes, and is joined by the pipe's own DisposeAsync rather than here.
    _ = Task.Run(() => pipe.RunWriteLoopAsync(ct), CancellationToken.None);

    var readLoop = pipe.RunReadLoopAsync(
        (upstreamFrame, upstreamCt) => OnUpstreamFrameAsync(pipe, upstreamFrame, upstreamCt),
        ct);

    // Detach once reading stops. Disposal belongs to the listener (or the cascade walker).
    _ = readLoop.ContinueWith(
        finished =>
        {
            _pipes.TryRemove(pipe.Id, out _);
        },
        TaskScheduler.Default);

    return readLoop;
}
|
||||
|
||||
/// <summary>
/// Shuts the multiplexer down. In order: unregisters the counters provider, cancels the
/// dispose token, waits up to two seconds for the watchdog, tears down the backend with
/// an upstream cascade, completes the outbound channel, disposes any pipes still
/// attached, and finally disposes the dispose token source. Repeated calls return
/// immediately.
/// </summary>
public async ValueTask DisposeAsync()
{
    if (_disposed)
    {
        return;
    }

    _disposed = true;

    // Unhook telemetry first so a status snapshot taken during teardown doesn't read
    // internal state that is about to be emptied.
    _ctx.Counters.SetMultiplexProvider(null);

    await _disposeCts.CancelAsync().ConfigureAwait(false);

    // Give the watchdog a bounded chance to finish any log/dispatch it is in the middle
    // of, so tests asserting on counter state see a settled picture.
    if (_watchdogTask is not null)
    {
        try
        {
            await _watchdogTask.WaitAsync(TimeSpan.FromSeconds(2)).ConfigureAwait(false);
        }
        catch
        {
            /* swallow */
        }
    }

    await TearDownBackendAsync("disposing", cascadeUpstreams: true).ConfigureAwait(false);
    _outboundChannel.Writer.TryComplete();

    // Dispose whatever is still attached after the teardown above.
    foreach (var remaining in _pipes.Values)
    {
        try
        {
            await remaining.DisposeAsync().ConfigureAwait(false);
        }
        catch
        {
            /* best effort */
        }
    }

    _pipes.Clear();
    _disposeCts.Dispose();
}
|
||||
|
||||
// ── Backend connect / teardown ────────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Makes sure a connected backend socket with running writer and reader tasks exists,
/// connecting (through the optional Polly pipeline) when it does not. Concurrent callers
/// are serialised on the connect gate so only one of them performs the connect.
/// </summary>
/// <param name="ct">Cancels waiting for the gate and the connect attempt.</param>
/// <returns>
/// <c>true</c> when the backend is connected; <c>false</c> when the multiplexer is
/// disposed or the connect failed (the failure is logged and counted).
/// </returns>
private async Task<bool> EnsureBackendConnectedAsync(CancellationToken ct)
{
    if (_disposed) return false;

    // Fast path: already connected.
    if (BackendIsLive()) return true;

    // Serialise concurrent connect attempts from many upstream pipes.
    await _connectGate.WaitAsync(ct).ConfigureAwait(false);
    try
    {
        // Re-check after acquiring the gate: another caller may have connected, or the
        // multiplexer may have been disposed while we were waiting.
        if (_disposed) return false;
        if (BackendIsLive()) return true;

        Socket backend;
        try
        {
            if (_backendConnectPipeline is not null)
            {
                // Every retry attempt gets its own socket: a Socket cannot be reused
                // after a failed or cancelled ConnectAsync, so retrying on the same
                // instance could never succeed.
                Socket? connected = null;
                await _backendConnectPipeline.ExecuteAsync(async attemptToken =>
                {
                    connected = await OpenBackendSocketAsync(attemptToken).ConfigureAwait(false);
                }, ct).ConfigureAwait(false);

                backend = connected
                    ?? throw new InvalidOperationException("Backend connect pipeline completed without a connected socket.");
            }
            else
            {
                backend = await OpenBackendSocketAsync(ct).ConfigureAwait(false);
            }
        }
        catch (Exception ex)
        {
            string reason = ex is OperationCanceledException
                ? $"Backend connect timed out or cancelled after {_connectionOptions.BackendConnectTimeoutMs} ms"
                : ex.Message;
            MultiplexerLogEvents.BackendFailed(_logger, _plc.Name, reason);
            _ctx.Counters.IncrementConnectFailed();
            return false;
        }

        // Disposal may have raced with the connect; don't start tasks for a multiplexer
        // that is already gone, and don't leak the freshly connected socket.
        CancellationTokenSource backendCts;
        try
        {
            if (_disposed)
            {
                backend.Dispose();
                return false;
            }

            backendCts = CancellationTokenSource.CreateLinkedTokenSource(_disposeCts.Token);
        }
        catch (ObjectDisposedException)
        {
            backend.Dispose();
            return false;
        }

        // Successful connect. Wire up the backend tasks.
        lock (_backendLock)
        {
            _backendSocket = backend;
            _backendCts = backendCts;
            _backendWriterTask = Task.Run(() => RunBackendWriterAsync(backend, backendCts.Token), CancellationToken.None);
            _backendReaderTask = Task.Run(() => RunBackendReaderAsync(backend, backendCts.Token), CancellationToken.None);
        }

        _ctx.Counters.IncrementConnectSuccess();
        MultiplexerLogEvents.BackendConnected(_logger, _plc.Name, _plc.Host, _plc.Port);
        return true;
    }
    finally
    {
        _connectGate.Release();
    }

    // A backend counts as live while its socket reports Connected and its CTS is not cancelled.
    bool BackendIsLive() =>
        _backendSocket is { Connected: true } && _backendCts is { IsCancellationRequested: false };
}

/// <summary>
/// Creates a new TCP socket and connects it to the PLC, bounded by
/// <c>BackendConnectTimeoutMs</c>. The socket is disposed if the attempt fails.
/// </summary>
/// <param name="attemptToken">Cancels this single connect attempt.</param>
/// <returns>The connected socket; ownership passes to the caller.</returns>
private async Task<Socket> OpenBackendSocketAsync(CancellationToken attemptToken)
{
    var socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp)
    { NoDelay = true };

    try
    {
        using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(attemptToken);
        timeoutCts.CancelAfter(_connectionOptions.BackendConnectTimeoutMs);
        await socket.ConnectAsync(_plc.Host, _plc.Port, timeoutCts.Token).ConfigureAwait(false);
        return socket;
    }
    catch
    {
        socket.Dispose();
        throw;
    }
}
|
||||
|
||||
private readonly SemaphoreSlim _connectGate = new(1, 1);
|
||||
|
||||
/// <summary>
/// Detaches and closes the current backend connection: cancels the backend tasks,
/// closes the socket, drains the correlation map (releasing every proxy TxId) and,
/// when requested, disposes every attached upstream pipe. Does nothing when no backend
/// socket or CTS is currently set.
/// </summary>
/// <param name="reason">Human-readable cause, passed to the disconnect log event.</param>
/// <param name="cascadeUpstreams">
/// <c>true</c> to dispose all attached upstream pipes and count them as disconnect cascades.
/// </param>
private async Task TearDownBackendAsync(string reason, bool cascadeUpstreams)
{
    Socket? oldSocket;
    CancellationTokenSource? oldCts;
    Task? writer, reader;

    // Take ownership of the current backend state atomically so that concurrent
    // teardown calls cannot both act on the same socket/tasks.
    lock (_backendLock)
    {
        oldSocket = _backendSocket;
        oldCts = _backendCts;
        writer = _backendWriterTask;
        reader = _backendReaderTask;

        _backendSocket = null;
        _backendCts = null;
        _backendWriterTask = null;
        _backendReaderTask = null;
    }

    if (oldSocket is null && oldCts is null) return;

    try { oldCts?.Cancel(); } catch { /* best effort */ }

    try { oldSocket?.Shutdown(SocketShutdown.Both); } catch { /* already closed */ }
    try { oldSocket?.Dispose(); } catch { /* best effort */ }

    // Drain the correlation map and hand every proxy TxId back to the allocator.
    var dropped = _correlation.DrainAll();
    foreach (var kvp in dropped)
    {
        _allocator.Release(kvp.Key);
    }

    int upstreamCount = 0;
    if (cascadeUpstreams)
    {
        // ALL attached upstreams cascade on backend disconnect, not only those with a
        // request in flight. Work from a snapshot and remove each pipe individually:
        // a blanket Clear() could silently drop a pipe attached after the snapshot
        // without ever disposing it.
        UpstreamPipe[] snapshot = _pipes.Values.ToArray();
        upstreamCount = snapshot.Length;

        foreach (var pipe in snapshot)
        {
            _pipes.TryRemove(pipe.Id, out _);
            try { await pipe.DisposeAsync().ConfigureAwait(false); }
            catch { /* best effort */ }
        }

        _ctx.Counters.AddDisconnectCascades(upstreamCount);
    }

    // Best-effort join. The wait is bounded so that a teardown started from inside the
    // writer or reader task cannot block on its own task indefinitely.
    try { if (writer is not null) await writer.WaitAsync(TimeSpan.FromSeconds(2)).ConfigureAwait(false); } catch { /* swallow */ }
    try { if (reader is not null) await reader.WaitAsync(TimeSpan.FromSeconds(2)).ConfigureAwait(false); } catch { /* swallow */ }

    oldCts?.Dispose();

    if (upstreamCount > 0 || dropped.Count > 0)
        MultiplexerLogEvents.BackendDisconnected(_logger, _plc.Name, upstreamCount, dropped.Count, reason);
}
|
||||
|
||||
// ── Backend writer / reader tasks ─────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Backend writer loop: takes frames from the outbound channel one at a time and sends
/// each one in full on <paramref name="backend"/>. Cancellation ends the loop quietly;
/// any other failure starts a cascading backend teardown.
/// </summary>
/// <param name="backend">The connected backend socket to send on.</param>
/// <param name="ct">Token that stops the loop.</param>
private async Task RunBackendWriterAsync(Socket backend, CancellationToken ct)
{
    var outbound = _outboundChannel.Reader;

    try
    {
        while (await outbound.WaitToReadAsync(ct).ConfigureAwait(false))
        {
            while (outbound.TryRead(out var frame))
            {
                // A single SendAsync may write only part of the buffer; keep going
                // until the whole frame is on the wire.
                for (int offset = 0; offset < frame.Length;)
                {
                    int written = await backend
                        .SendAsync(frame.AsMemory(offset, frame.Length - offset), SocketFlags.None, ct)
                        .ConfigureAwait(false);

                    if (written == 0)
                    {
                        throw new SocketException((int)SocketError.ConnectionReset);
                    }

                    offset += written;
                }
            }
        }
    }
    catch (OperationCanceledException)
    {
        // Normal teardown.
    }
    catch (Exception ex)
    {
        // Backend failure — cascade.
        _ = TearDownBackendAsync($"writer fault: {ex.Message}", cascadeUpstreams: true);
    }
}
|
||||
|
||||
/// <summary>
/// Backend reader loop: reads MBAP frames from <paramref name="backend"/>, matches each
/// to its in-flight request by proxy TxId, runs the response through the PDU pipeline
/// and forwards it to every interested upstream pipe with that pipe's original TxId
/// restored. When the loop ends for any reason other than cancellation, a cascading
/// backend teardown is started.
/// </summary>
/// <param name="backend">The connected backend socket to read from.</param>
/// <param name="ct">Token that stops the loop.</param>
private async Task RunBackendReaderAsync(Socket backend, CancellationToken ct)
{
    byte[] headerBuf = new byte[MbapFrame.HeaderSize];
    try
    {
        while (!ct.IsCancellationRequested)
        {
            if (!await FillAsync(backend, headerBuf, 0, MbapFrame.HeaderSize, ct).ConfigureAwait(false))
                break;

            if (!MbapFrame.TryParseHeader(headerBuf.AsSpan(),
                    out ushort proxyTxId, out _, out ushort length, out _))
                break;

            if (length < 1)
            {
                // Degenerate frame — drop.
                continue;
            }

            int pduBodyLen = length - 1;
            if (pduBodyLen > MbapFrame.MaxPduBodySize)
            {
                // Frame too large — backend is misbehaving; force teardown.
                _logger.LogWarning(
                    "Oversized backend frame: Plc={Plc} PduBody={Body} > Max={Max}",
                    _plc.Name, pduBodyLen, MbapFrame.MaxPduBodySize);
                break;
            }

            byte[] frame = new byte[MbapFrame.HeaderSize + pduBodyLen];
            Buffer.BlockCopy(headerBuf, 0, frame, 0, MbapFrame.HeaderSize);

            if (!await FillAsync(backend, frame, MbapFrame.HeaderSize, pduBodyLen, ct).ConfigureAwait(false))
                break;

            if (!_correlation.TryRemove(proxyTxId, out var inFlight))
            {
                // No correlation entry — either a stale response after cascade, or
                // the PLC sent something unsolicited. Drop the frame.
                continue;
            }

            // Free the allocator slot immediately so it can be reused.
            _allocator.Release(proxyTxId);

            // Update the round-trip EWMA. The counter takes Stopwatch ticks, while the
            // request carries a wall-clock send time, so convert the elapsed wall-clock
            // seconds. A non-positive result (clock stepped backwards) is skipped.
            TimeSpan roundTrip = DateTimeOffset.UtcNow - inFlight.SentAtUtc;
            long ticks = (long)(roundTrip.TotalSeconds * Stopwatch.Frequency);
            if (ticks > 0)
                _ctx.Counters.UpdateRoundTripEwma(ticks);

            // Apply the BCD rewriter on the response. Build a per-call context clone
            // that carries CurrentRequest so the rewriter can decode FC03/04 slots.
            var responseCtx = _ctx.WithCurrentRequest(inFlight);
            _pipeline.Process(
                MbapDirection.ResponseToClient,
                frame.AsSpan(0, MbapFrame.HeaderSize),
                frame.AsSpan(MbapFrame.HeaderSize, pduBodyLen),
                responseCtx);

            // Fan out to each interested party with their original TxId restored.
            // Phase 9: always exactly one party. Phase 10: N parties (read coalescing).
            // With a single party the frame buffer is handed over as-is; with several,
            // each party gets its own copy so the TxId patches don't overwrite each other.
            bool singleParty = inFlight.InterestedParties.Count == 1;
            foreach (var party in inFlight.InterestedParties)
            {
                if (!party.Pipe.IsAlive)
                    continue;

                byte[] outFrame = singleParty ? frame : (byte[])frame.Clone();
                outFrame[0] = (byte)(party.OriginalTxId >> 8);
                outFrame[1] = (byte)(party.OriginalTxId & 0xFF);

                try
                {
                    await party.Pipe.SendResponseAsync(outFrame, ct).ConfigureAwait(false);
                }
                catch (Exception ex) when (ex is not OperationCanceledException)
                {
                    // Best-effort delivery: a pipe can close between the IsAlive check
                    // and the send. That must not fault the shared reader and cascade
                    // a teardown onto every other upstream.
                    _logger.LogDebug(ex, "Dropped backend response for a closing upstream pipe: Plc={Plc}", _plc.Name);
                }
            }
        }

        // Reader loop ended without an exception (remote close, invalid header or
        // oversized frame). Cascade.
        _ = TearDownBackendAsync("backend reader EOF", cascadeUpstreams: true);
    }
    catch (OperationCanceledException)
    {
        // Normal teardown.
    }
    catch (Exception ex)
    {
        _ = TearDownBackendAsync($"reader fault: {ex.Message}", cascadeUpstreams: true);
    }
}
|
||||
|
||||
// ── Upstream → multiplexer entry point ────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Entry point for one request frame read from an upstream pipe. Ensures the backend is
/// connected, allocates a proxy TxId, records the request in the correlation map, runs
/// the frame through the PDU pipeline, rewrites the MBAP TxId and queues the frame for
/// the backend writer.
/// </summary>
/// <param name="pipe">The upstream pipe the frame came from.</param>
/// <param name="frame">Complete MBAP frame; it is modified in place.</param>
/// <param name="ct">Cancels the backend connect and the enqueue.</param>
/// <remarks>
/// Frames without at least one PDU byte after the MBAP header, or with an unparsable
/// header, are dropped. If the backend cannot be connected the pipe is disposed. If no
/// proxy TxId is free the upstream receives Modbus exception 04.
/// </remarks>
private async ValueTask OnUpstreamFrameAsync(UpstreamPipe pipe, byte[] frame, CancellationToken ct)
{
    if (_disposed) return;

    // Ensure backend is connected. Failure here means we cannot service the request;
    // close the upstream pipe (consistent with the 1:1 model's behaviour on connect
    // failure).
    if (!await EnsureBackendConnectedAsync(ct).ConfigureAwait(false))
    {
        try { await pipe.DisposeAsync().ConfigureAwait(false); } catch { /* best effort */ }
        return;
    }

    // The frame must carry the MBAP header plus at least the function-code byte.
    // Rejecting header-only frames here, before a TxId is allocated, prevents an
    // out-of-range read of the function code and a leaked allocator slot.
    if (frame.Length <= MbapFrame.HeaderSize)
        return;

    if (!MbapFrame.TryParseHeader(frame.AsSpan(0, MbapFrame.HeaderSize),
            out ushort originalTxId, out _, out _, out byte unitId))
        return;

    int pduOffset = MbapFrame.HeaderSize;
    byte fcByte = frame[pduOffset];

    if (!_allocator.TryAllocate(out ushort proxyTxId))
    {
        MultiplexerLogEvents.Saturated(_logger, _plc.Name, pipe.RemoteEp?.ToString() ?? "?");
        // Synthesize Modbus exception 04 (Slave Device Failure).
        byte[] excFrame = BuildExceptionFrame(originalTxId, unitId, fcByte, exceptionCode: 4);
        await pipe.SendResponseAsync(excFrame, ct).ConfigureAwait(false);
        return;
    }

    // Parse start/qty (for FC03/04) so the response decoder has the correlation it needs.
    ushort startAddr = 0;
    ushort qty = 0;
    if (fcByte is 0x03 or 0x04 && frame.Length >= pduOffset + 5)
    {
        startAddr = (ushort)((frame[pduOffset + 1] << 8) | frame[pduOffset + 2]);
        qty = (ushort)((frame[pduOffset + 3] << 8) | frame[pduOffset + 4]);
    }

    var inFlight = new InFlightRequest(
        UnitId: unitId,
        Fc: fcByte,
        StartAddress: startAddr,
        Qty: qty,
        InterestedParties: [new InterestedParty(pipe, originalTxId)],
        SentAtUtc: DateTimeOffset.UtcNow);

    if (!_correlation.TryAdd(proxyTxId, inFlight))
    {
        // Should be impossible: the allocator just guaranteed proxyTxId is free.
        _allocator.Release(proxyTxId);
        _logger.LogError("CorrelationMap.TryAdd failed for already-free proxyTxId {ProxyTxId}", proxyTxId);
        return;
    }

    try
    {
        // Peak in-flight tracking.
        _ctx.Counters.ObserveInFlight(_allocator.InFlightCount);

        // Apply the BCD rewriter on the request. Use a per-call context with CurrentRequest
        // (the rewriter doesn't currently need it on request, but Phase 10 may).
        var requestCtx = _ctx.WithCurrentRequest(inFlight);
        _pipeline.Process(
            MbapDirection.RequestToBackend,
            frame.AsSpan(0, MbapFrame.HeaderSize),
            frame.AsSpan(MbapFrame.HeaderSize, frame.Length - MbapFrame.HeaderSize),
            requestCtx);

        // Overwrite the MBAP TxId with the proxy TxId.
        frame[0] = (byte)(proxyTxId >> 8);
        frame[1] = (byte)(proxyTxId & 0xFF);

        // Enqueue for the backend writer task.
        await _outboundChannel.Writer.WriteAsync(frame, ct).ConfigureAwait(false);
    }
    catch (ChannelClosedException)
    {
        // Channel completed during shutdown — release the proxy TxId.
        ReleaseClaim();
    }
    catch
    {
        // The request never reached the writer (cancelled, or the pipeline threw).
        // Release the claim now instead of leaving it for the watchdog, then let the
        // caller see the original exception.
        ReleaseClaim();
        throw;
    }

    // Releases the TxId only if this call still owns the correlation entry; the reader,
    // the watchdog or a teardown may already have claimed and released it.
    void ReleaseClaim()
    {
        if (_correlation.TryRemove(proxyTxId, out _))
            _allocator.Release(proxyTxId);
    }
}
|
||||
|
||||
// ── Per-request timeout watchdog ──────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Background loop that expires in-flight requests whose response has not arrived within
/// <see cref="ConnectionOptions.BackendRequestTimeoutMs"/>. Each expired entry is removed
/// from the correlation map, its allocator slot is released, and every interested party
/// whose pipe is still alive receives a Modbus exception frame (code 0x0B / Gateway
/// Target Device Failed To Respond) carrying that party's original TxId.
///
/// <para><b>Why this exists.</b> With a dedicated socket per upstream (the 1:1 model) a
/// lost response would fault that socket and close the pair. On a shared socket, one
/// missing or mis-routed response would otherwise leave its correlation entry in place
/// forever and hang the upstream pipe. Real-world causes: PLC drops a response, network
/// packet loss, backend that mis-echoes MBAP TxIds.</para>
/// </summary>
/// <param name="ct">Token that stops the watchdog.</param>
private async Task RunRequestTimeoutWatchdogAsync(CancellationToken ct)
{
    // Scan roughly four times per timeout period for responsive cleanup, but never more
    // often than every 100 ms so very small timeouts don't cause a busy-wake loop.
    int tickMs = Math.Max(100, _connectionOptions.BackendRequestTimeoutMs / 4);

    try
    {
        while (!ct.IsCancellationRequested)
        {
            await Task.Delay(tickMs, ct).ConfigureAwait(false);

            var cutoff = DateTimeOffset.UtcNow.AddMilliseconds(-_connectionOptions.BackendRequestTimeoutMs);
            var expired = _correlation.SnapshotOlderThan(cutoff);
            if (expired.Count == 0)
            {
                continue;
            }

            foreach (var entry in expired)
            {
                ushort proxyTxId = entry.Key;

                // Claim the entry. If a response or a cascade removed it first, there
                // is nothing left to do for it.
                if (!_correlation.TryRemove(proxyTxId, out var req))
                {
                    continue;
                }

                _allocator.Release(proxyTxId);

                long elapsedMs = (long)(DateTimeOffset.UtcNow - req.SentAtUtc).TotalMilliseconds;

                foreach (var party in req.InterestedParties)
                {
                    MultiplexerLogEvents.RequestTimeout(
                        _logger, _plc.Name, proxyTxId, party.OriginalTxId, req.Fc, elapsedMs);

                    if (party.Pipe.IsAlive)
                    {
                        // Answer with exception 0x0B so the client library can raise a
                        // clean ModbusException instead of hanging on its own timeout.
                        byte[] timeoutFrame = BuildExceptionFrame(
                            party.OriginalTxId, req.UnitId, req.Fc, exceptionCode: 0x0B);

                        try
                        {
                            await party.Pipe.SendResponseAsync(timeoutFrame, ct).ConfigureAwait(false);
                        }
                        catch
                        {
                            // Best-effort delivery; if the pipe is going down, the client
                            // discovers the failure through its own socket close path.
                        }
                    }
                }
            }
        }
    }
    catch (OperationCanceledException)
    {
        // Normal teardown.
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Request-timeout watchdog faulted: Plc={Plc}", _plc.Name);
    }
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Receives from <paramref name="socket"/> until exactly <paramref name="count"/> bytes
/// have been written into <paramref name="buf"/> starting at <paramref name="offset"/>.
/// </summary>
/// <param name="socket">Socket to receive from.</param>
/// <param name="buf">Destination buffer.</param>
/// <param name="offset">Index in <paramref name="buf"/> of the first byte to write.</param>
/// <param name="count">Number of bytes to receive.</param>
/// <param name="ct">Cancels the pending receive.</param>
/// <returns>
/// <c>true</c> once all bytes are in the buffer; <c>false</c> if a receive returned zero
/// bytes before that.
/// </returns>
private static async Task<bool> FillAsync(
    Socket socket, byte[] buf, int offset, int count, CancellationToken ct)
{
    int filled = 0;
    while (filled < count)
    {
        Memory<byte> window = buf.AsMemory(offset + filled, count - filled);
        int received = await socket.ReceiveAsync(window, SocketFlags.None, ct).ConfigureAwait(false);
        if (received == 0)
        {
            return false;
        }

        filled += received;
    }

    return true;
}
|
||||
|
||||
/// <summary>
/// Builds a Modbus exception response frame: the MBAP header followed by the two-byte
/// exception PDU <c>[fc | 0x80][exceptionCode]</c>.
/// </summary>
/// <param name="originalTxId">Transaction id written big-endian into bytes 0-1.</param>
/// <param name="unitId">Unit id written into byte 6.</param>
/// <param name="fc">Function code of the failed request; written with bit 0x80 set.</param>
/// <param name="exceptionCode">Modbus exception code written as the last byte.</param>
/// <returns>A newly allocated frame of <c>MbapFrame.HeaderSize + 2</c> bytes.</returns>
private static byte[] BuildExceptionFrame(ushort originalTxId, byte unitId, byte fc, byte exceptionCode)
{
    const int ExceptionPduSize = 2;   // [fc | 0x80][exceptionCode]
    const byte MbapLength = 3;        // UnitId(1) + ExFc(1) + ExCode(1)

    byte[] result = new byte[MbapFrame.HeaderSize + ExceptionPduSize];

    // Transaction id, big-endian.
    result[0] = (byte)(originalTxId >> 8);
    result[1] = (byte)(originalTxId & 0xFF);

    // Protocol id is always zero.
    result[2] = 0;
    result[3] = 0;

    // Length field, big-endian: high byte zero, low byte covers unit id + PDU.
    result[4] = 0;
    result[5] = MbapLength;

    result[6] = unitId;
    result[7] = (byte)(fc | 0x80);
    result[8] = exceptionCode;

    return result;
}
|
||||
}
|
||||
Reference in New Issue
Block a user