mbproxy: initial commit through Phase 9 (TxId multiplexing)

Adds the mbproxy service end-to-end. Phases 00-08 implement the
production-ready single-listener / 1:1-backend transparent Modbus TCP
proxy with bidirectional BCD rewriting for the ~54-PLC DL205/DL260
fleet. Phase 9 replaces the connection layer with a single backend
socket per PLC plus MBAP TxId rewriting, lifting the H2-ECOM100's
4-concurrent-client cap as an operational ceiling.

Phase 9 additions of note:
- PlcMultiplexer + UpstreamPipe + TxIdAllocator + CorrelationMap
- InFlightRequest with IReadOnlyList<InterestedParty> (load-bearing
  for Phase 10 read coalescing — do not collapse to a single field)
- Per-request watchdog: surfaces Modbus exception 0x0B to upstream
  on BackendRequestTimeoutMs, defending against lost responses,
  dead-PLC paths, and pymodbus 3.13.0's concurrent-multiplexed-
  request bug (its ServerRequestHandler.last_pdu state race)
- Status DTO + HTML gain inFlight / maxInFlight / txIdWraps /
  disconnectCascades / queueDepth (Tier 1.6 in docs/kpi.md)

Tests: 263 unit + 38 E2E. Multiplexer correctness under truly
concurrent backend traffic is proved against a stub backend in
PlcMultiplexerTests; MultiplexerE2ETests paces requests so pymodbus
3.13's single-PDU framer stays in known-good mode.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-14 01:49:35 -04:00
parent 2e937228a0
commit 56eee3c563
105 changed files with 18430 additions and 0 deletions
+460
View File
@@ -0,0 +1,460 @@
using Mbproxy.Bcd;
namespace Mbproxy.Proxy;
/// <summary>
/// BCD-rewriting PDU pipeline. Registered as the singleton <see cref="IPduPipeline"/>
/// in production (replaces <see cref="NoopPduPipeline"/> from Phase 03).
///
/// FC scope (per design.md):
/// FC03 / FC04 response — decode covered BCD slots from raw nibbles → binary integer.
/// FC06 request — encode binary integer → BCD nibbles.
/// FC16 request — per-register over the configured slots.
/// All other FCs — pass through byte-for-byte.
///
/// MBAP transparency contract: the MBAP length field is NEVER modified. Re-encoded slots
/// are the same byte width as the originals (ushort → ushort), so the PDU length is stable.
///
/// <para><b>Phase 9 — request correlation:</b> FC03/FC04 responses do not carry the
/// original start address. The multiplexer builds an <see cref="Multiplexing.InFlightRequest"/>
/// on the request path, stores it in its <see cref="Multiplexing.CorrelationMap"/>, and
/// attaches it to the per-call <see cref="PerPlcContext.CurrentRequest"/> on the response
/// path. The rewriter consumes <c>CurrentRequest</c> instead of a per-pair last-request
/// slot, so concurrent responses from different upstream clients each decode against
/// their own request range without cross-talk.</para>
///
/// <para>This class is stateless. All per-call state arrives via <see cref="PduContext"/>
/// (specifically <see cref="PerPlcContext.CurrentRequest"/> on response). It is safe to
/// call concurrently from multiple upstream-read tasks and the single backend reader task.</para>
/// </summary>
internal sealed class BcdPduPipeline : IPduPipeline
{
// ── IPduPipeline.Process ─────────────────────────────────────────────────
/// <summary>
/// Entry point for every PDU in either direction. Counts the PDU, then hands it to the
/// request or response rewriter. Nothing is touched (and nothing is counted) when
/// <paramref name="context"/> is not a <see cref="PerPlcContext"/> or the PDU is empty.
/// </summary>
/// <param name="direction">Which way the frame is travelling.</param>
/// <param name="mbapHeader">MBAP header bytes; not read by this implementation.</param>
/// <param name="pdu">PDU bytes starting at the function code; may be rewritten in place.</param>
/// <param name="context">Per-call context; BCD processing needs a <see cref="PerPlcContext"/>.</param>
public void Process(
    MbapDirection direction,
    ReadOnlySpan<byte> mbapHeader,
    Span<byte> pdu,
    PduContext context)
{
    // A plain PduContext carries no tag map, counters, or logger, so there is nothing
    // to rewrite with; an empty PDU has no function code to dispatch on.
    if (context is not PerPlcContext plcContext || pdu.IsEmpty)
    {
        return;
    }

    byte functionCode = pdu[0];

    // Both directions are counted, so a request/response pair bumps each counter twice.
    plcContext.Counters.IncrementPdusForwarded();
    plcContext.Counters.IncrementFcCount(functionCode);

    switch (direction)
    {
        case MbapDirection.RequestToBackend:
            ProcessRequest(functionCode, pdu, plcContext);
            break;

        default:
            ProcessResponse(functionCode, pdu, plcContext);
            break;
    }
}
// ── Request processing (FC06 / FC16) ────────────────────────────────────
/// <summary>
/// Dispatches a request PDU to the matching write rewriter. Only FC06 (0x06) and
/// FC16 (0x10) are rewritten; every other function code is left byte-for-byte intact.
/// </summary>
private static void ProcessRequest(byte fc, Span<byte> pdu, PerPlcContext ctx)
{
    if (fc == 0x06)
    {
        ProcessFc06Request(pdu, ctx);
    }
    else if (fc == 0x10)
    {
        ProcessFc16Request(pdu, ctx);
    }

    // Anything else: transparent pass-through.
}
/// <summary>
/// Rewrites an FC06 Write Single Register request in place:
/// <c>[fc=06][addrHi][addrLo][valHi][valLo]</c>.
/// <list type="bullet">
/// <item><description>Address is a configured 16-bit BCD tag: the client's binary value is
/// replaced by its BCD encoding and one rewritten slot is counted.</description></item>
/// <item><description>Address is either register of a 32-bit BCD tag: a single-register write
/// can only cover half of the pair, so a partial-BCD warning is logged and counted and the
/// bytes are forwarded untouched.</description></item>
/// <item><description>Value cannot be BCD-encoded (<see cref="ArgumentOutOfRangeException"/>
/// from the codec): an invalid-BCD event is logged and counted and the bytes are forwarded
/// untouched.</description></item>
/// <item><description>Anything else, including a PDU shorter than 5 bytes: pass through.</description></item>
/// </list>
/// </summary>
private static void ProcessFc06Request(Span<byte> pdu, PerPlcContext ctx)
{
    if (pdu.Length < 5)
    {
        return;
    }

    ushort register = (ushort)((pdu[1] << 8) | pdu[2]);
    ushort clientValue = (ushort)((pdu[3] << 8) | pdu[4]);

    if (ctx.TagMap.TryGet(register, out var tag))
    {
        if (tag.IsThirtyTwoBit)
        {
            // Exact hit on the LOW register of a 32-bit pair: still only half a write.
            ReportPartialWrite(ctx, register);
            return;
        }

        ushort bcd;
        try
        {
            bcd = BcdCodec.Encode16(clientValue);
        }
        catch (ArgumentOutOfRangeException)
        {
            // Not representable as BCD: forward the raw value unchanged.
            RewriterLogEvents.InvalidBcd(ctx.Logger, ctx.PlcName, register, clientValue, "Write");
            ctx.Counters.IncrementInvalidBcd();
            return;
        }

        pdu[3] = (byte)(bcd >> 8);
        pdu[4] = (byte)(bcd & 0xFF);
        ctx.Counters.AddRewrittenSlots(1);
        return;
    }

    // No tag starts at this address. It may still be the HIGH register of a 32-bit pair
    // whose tag is registered one address lower; a one-register range query reports such
    // a tag with a negative word offset.
    if (!ctx.TagMap.TryGetForRange(register, 1, out var overlaps) || overlaps.Count <= 0)
    {
        return;
    }

    foreach (var overlap in overlaps)
    {
        if (overlap.Tag.IsThirtyTwoBit && overlap.OffsetWords < 0)
        {
            ReportPartialWrite(ctx, register);
            return;
        }
    }

    // Logs and counts a single-register write that lands on half of a 32-bit BCD pair.
    static void ReportPartialWrite(PerPlcContext plc, ushort at)
    {
        RewriterLogEvents.PartialBcd(plc.Logger, plc.PlcName, at, at, 1);
        plc.Counters.IncrementPartialBcd();
    }
}
/// <summary>
/// Rewrites an FC16 Write Multiple Registers request in place:
/// <c>[fc=10][startHi][startLo][qtyHi][qtyLo][byteCount][reg0Hi][reg0Lo]...</c>
/// Every configured BCD tag that lies fully inside the written range has the client's
/// binary value replaced by BCD nibbles. The PDU length never changes.
/// <list type="bullet">
/// <item><description>16-bit tag: the register is encoded on its own.</description></item>
/// <item><description>32-bit tag fully covered: the two registers are combined as
/// <c>high * 10_000 + low</c>, encoded, and written back as two BCD words (low word at the
/// tag address, high word at the next register).</description></item>
/// <item><description>32-bit tag only half covered: partial-BCD warning, registers untouched.</description></item>
/// <item><description>Value the codec rejects: invalid-BCD event, registers untouched; other
/// tags in the same request are still processed.</description></item>
/// </list>
/// The register quantity is taken from the wire and is therefore clamped to the number of
/// whole registers the PDU actually carries before it is used for the tag lookup — the same
/// defence the FC03/FC04 response path applies to its byte count.
/// </summary>
private static void ProcessFc16Request(Span<byte> pdu, PerPlcContext ctx)
{
    // fc(1) + start(2) + qty(2) + byteCount(1); register data begins right after.
    const int DataOffset = 6;

    if (pdu.Length < DataOffset)
    {
        return;
    }

    ushort startAddress = (ushort)((pdu[1] << 8) | pdu[2]);
    ushort qty = (ushort)((pdu[3] << 8) | pdu[4]);

    // pdu[5] (byteCount) is deliberately not trusted either; the span length is the
    // only authority on how many registers are really present. Clamping keeps a
    // malformed quantity (up to 65535) from driving an oversized range lookup and
    // guarantees every offset computed below is inside the span.
    int wordsInRequest = (pdu.Length - DataOffset) / 2;
    ushort effectiveQty = (ushort)Math.Min(qty, wordsInRequest);
    if (effectiveQty == 0)
    {
        return; // no register data to rewrite
    }

    if (!ctx.TagMap.TryGetForRange(startAddress, effectiveQty, out var hits))
    {
        return; // no BCD tags in this range
    }

    foreach (var hit in hits)
    {
        int offsetWords = hit.OffsetWords;
        var tag = hit.Tag;

        if (tag.IsThirtyTwoBit)
        {
            // The pair is rewritable only when BOTH registers are inside the write.
            bool lowInRange = offsetWords >= 0 && offsetWords < effectiveQty;
            bool highInRange = (offsetWords + 1) >= 0 && (offsetWords + 1) < effectiveQty;
            if (!lowInRange || !highInRange)
            {
                RewriterLogEvents.PartialBcd(ctx.Logger, ctx.PlcName,
                    tag.Address, startAddress, qty);
                ctx.Counters.IncrementPartialBcd();
                continue;
            }

            // offsetWords + 1 < effectiveQty <= wordsInRequest, so both reads are in bounds.
            int lowByteOff = DataOffset + offsetWords * 2;
            int highByteOff = lowByteOff + 2;

            ushort clientLow = (ushort)((pdu[lowByteOff] << 8) | pdu[lowByteOff + 1]);
            ushort clientHigh = (ushort)((pdu[highByteOff] << 8) | pdu[highByteOff + 1]);

            // The client-side representation is two decimal halves, not a 16/16 bit split:
            // the register at the tag address carries the low four decimal digits and the
            // next register the high four. This mirrors the FC03/FC04 response decode,
            // which hands the client (value % 10_000, value / 10_000).
            // NOTE(review): a low word above 9999 is not rejected here; it carries into
            // the high digits — confirm that is the intended contract.
            int binaryValue = clientHigh * 10_000 + clientLow;

            ushort bcdLow, bcdHigh;
            try
            {
                (bcdLow, bcdHigh) = BcdCodec.Encode32(binaryValue);
            }
            catch (ArgumentOutOfRangeException)
            {
                RewriterLogEvents.InvalidBcd(ctx.Logger, ctx.PlcName, tag.Address,
                    clientLow, "Write");
                ctx.Counters.IncrementInvalidBcd();
                continue; // leave both registers raw
            }

            pdu[lowByteOff] = (byte)(bcdLow >> 8);
            pdu[lowByteOff + 1] = (byte)(bcdLow & 0xFF);
            pdu[highByteOff] = (byte)(bcdHigh >> 8);
            pdu[highByteOff + 1] = (byte)(bcdHigh & 0xFF);
            ctx.Counters.AddRewrittenSlots(2);
        }
        else
        {
            // 16-bit tag. The range lookup should only return in-range hits; stay defensive.
            if (offsetWords < 0 || offsetWords >= effectiveQty)
            {
                continue;
            }

            // offsetWords < effectiveQty <= wordsInRequest, so the read is in bounds.
            int byteOff = DataOffset + offsetWords * 2;
            ushort clientValue = (ushort)((pdu[byteOff] << 8) | pdu[byteOff + 1]);

            ushort encoded;
            try
            {
                encoded = BcdCodec.Encode16(clientValue);
            }
            catch (ArgumentOutOfRangeException)
            {
                RewriterLogEvents.InvalidBcd(ctx.Logger, ctx.PlcName, tag.Address,
                    clientValue, "Write");
                ctx.Counters.IncrementInvalidBcd();
                continue; // leave the register raw
            }

            pdu[byteOff] = (byte)(encoded >> 8);
            pdu[byteOff + 1] = (byte)(encoded & 0xFF);
            ctx.Counters.AddRewrittenSlots(1);
        }
    }
}
// ── Response processing (FC03 / FC04) ───────────────────────────────────
/// <summary>
/// Rewrites a response PDU in place before it is returned to the client.
/// <list type="bullet">
/// <item><description>Exception responses (function code high bit set) are logged, counted by
/// exception code, and passed through.</description></item>
/// <item><description>FC06 echoes are handed to <c>ProcessFc06Response</c>.</description></item>
/// <item><description>FC03/FC04 register data is decoded from BCD nibbles to binary at every
/// configured tag inside the requested range. The range comes from
/// <c>ctx.CurrentRequest</c> because the response itself carries no start address; without
/// a matching FC03/FC04 request the bytes pass through.</description></item>
/// <item><description>Everything else (including FC16, whose response carries no register
/// data) passes through.</description></item>
/// </list>
/// </summary>
private static void ProcessResponse(byte fc, Span<byte> pdu, PerPlcContext ctx)
{
    if ((fc & 0x80) != 0)
    {
        // Exception frame: [fc|0x80][exceptionCode]. A truncated frame reports code 0.
        byte requestFc = (byte)(fc & 0x7F);
        byte code = pdu.Length >= 2 ? pdu[1] : (byte)0;
        RewriterLogEvents.ExceptionPassthrough(ctx.Logger, ctx.PlcName, requestFc, code);
        ctx.Counters.IncrementBackendException(code);
        return;
    }

    if (fc == 0x06)
    {
        // The PLC echoes what the proxy sent (BCD); the client expects what it sent (binary).
        ProcessFc06Response(pdu, ctx);
        return;
    }

    if (fc != 0x03 && fc != 0x04)
    {
        return;
    }

    // FC03/04 response: [fc][byteCount][reg0Hi][reg0Lo]...
    var request = ctx.CurrentRequest;
    if (request is null)
    {
        return;
    }

    if (request.Fc != 0x03 && request.Fc != 0x04)
    {
        return;
    }

    ushort firstRegister = request.StartAddress;
    ushort requestedWords = request.Qty;

    if (pdu.Length < 2)
    {
        return;
    }

    // Trust neither side blindly: work on the smaller of "words asked for" and
    // "words the byte count claims".
    int wordsClaimed = pdu[1] / 2;
    ushort usableWords = (ushort)Math.Min(requestedWords, wordsClaimed);

    if (!ctx.TagMap.TryGetForRange(firstRegister, usableWords, out var hits))
    {
        return;
    }

    const int DataStart = 2;

    foreach (var hit in hits)
    {
        var tag = hit.Tag;
        int word = hit.OffsetWords;

        if (!tag.IsThirtyTwoBit)
        {
            // ── 16-bit tag ──
            if (word < 0 || word >= usableWords)
            {
                continue;
            }

            int at = DataStart + word * 2;
            if (at + 2 > pdu.Length)
            {
                continue; // byte count promised more than the span holds
            }

            ushort bcd = (ushort)((pdu[at] << 8) | pdu[at + 1]);
            int plain;
            try
            {
                plain = BcdCodec.Decode16(bcd);
            }
            catch (FormatException)
            {
                RewriterLogEvents.InvalidBcd(ctx.Logger, ctx.PlcName, tag.Address, bcd, "Read");
                ctx.Counters.IncrementInvalidBcd();
                continue; // leave the register raw
            }

            pdu[at] = (byte)(plain >> 8);
            pdu[at + 1] = (byte)(plain & 0xFF);
            ctx.Counters.AddRewrittenSlots(1);
            continue;
        }

        // ── 32-bit tag: both registers must be inside the usable range ──
        bool pairCovered = word >= 0 && word < usableWords && word + 1 < usableWords;
        if (!pairCovered)
        {
            RewriterLogEvents.PartialBcd(ctx.Logger, ctx.PlcName,
                tag.Address, firstRegister, requestedWords);
            ctx.Counters.IncrementPartialBcd();
            continue;
        }

        int lowAt = DataStart + word * 2;
        int highAt = DataStart + (word + 1) * 2;
        if (lowAt + 2 > pdu.Length || highAt + 2 > pdu.Length)
        {
            continue;
        }

        // Register at the tag address holds the low four BCD digits, the next one the high four.
        ushort bcdLow = (ushort)((pdu[lowAt] << 8) | pdu[lowAt + 1]);
        ushort bcdHigh = (ushort)((pdu[highAt] << 8) | pdu[highAt + 1]);

        int combined;
        try
        {
            combined = BcdCodec.Decode32(bcdLow, bcdHigh);
        }
        catch (FormatException)
        {
            // Blame the low register when it has a bad nibble, otherwise the high one.
            bool lowIsBad = HasBadNibble(bcdLow);
            ushort badRaw = lowIsBad ? bcdLow : bcdHigh;
            ushort badAddr = lowIsBad ? tag.Address : tag.HighRegister;
            RewriterLogEvents.InvalidBcd(ctx.Logger, ctx.PlcName, badAddr, badRaw, "Read");
            ctx.Counters.IncrementInvalidBcd();
            continue;
        }

        // The client receives two decimal halves: low four digits, then high four digits.
        int lowHalf = combined % 10_000;
        int highHalf = combined / 10_000;
        pdu[lowAt] = (byte)(lowHalf >> 8);
        pdu[lowAt + 1] = (byte)(lowHalf & 0xFF);
        pdu[highAt] = (byte)(highHalf >> 8);
        pdu[highAt + 1] = (byte)(highHalf & 0xFF);
        ctx.Counters.AddRewrittenSlots(2);
    }
}
/// <summary>
/// Rewrites an FC06 response in place: <c>[fc=06][addrHi][addrLo][valHi][valLo]</c>.
/// The PLC echoes the address and the value it wrote. When the address is a configured
/// 16-bit BCD tag the echoed BCD nibbles are decoded back to binary so the client sees the
/// value it sent (echo-checking client libraries would otherwise reject the reply).
/// Echoes for non-BCD addresses and for either half of a 32-bit tag pass through.
/// </summary>
/// <remarks>
/// NOTE(review): the request path forwards a value raw when it cannot be BCD-encoded. The
/// echo of such a write reaches this method looking like any other echo and is decoded if
/// its nibbles happen to be valid BCD (or logged as invalid if not). Telling the two cases
/// apart needs per-request state this method does not receive — confirm whether that
/// matters for deployed clients.
/// </remarks>
private static void ProcessFc06Response(Span<byte> pdu, PerPlcContext ctx)
{
    if (pdu.Length < 5)
    {
        return;
    }

    ushort register = (ushort)((pdu[1] << 8) | pdu[2]);
    ushort echoed = (ushort)((pdu[3] << 8) | pdu[4]);

    // Only a 16-bit BCD tag is decoded; a 32-bit tag here means a partial write that
    // was passed through raw on the request path.
    bool isSixteenBitTag = ctx.TagMap.TryGet(register, out var tag) && !tag.IsThirtyTwoBit;
    if (!isSixteenBitTag)
    {
        return;
    }

    int plain;
    try
    {
        plain = BcdCodec.Decode16(echoed);
    }
    catch (FormatException)
    {
        RewriterLogEvents.InvalidBcd(ctx.Logger, ctx.PlcName, register, echoed, "Read");
        ctx.Counters.IncrementInvalidBcd();
        return;
    }

    pdu[3] = (byte)(plain >> 8);
    pdu[4] = (byte)(plain & 0xFF);

    // Rewritten-slot counter intentionally untouched: the request already counted this slot.
}
// ── Helpers ──────────────────────────────────────────────────────────────
/// <summary>
/// Reports whether <paramref name="raw"/> contains a hex digit that is not a decimal
/// digit, i.e. any of its four nibbles is 0xA or above.
/// </summary>
private static bool HasBadNibble(ushort raw)
{
    // Walk the nibbles from most to least significant.
    for (int shift = 12; shift >= 0; shift -= 4)
    {
        if (((raw >> shift) & 0xF) >= 0xA)
        {
            return true;
        }
    }

    return false;
}
}
+47
View File
@@ -0,0 +1,47 @@
namespace Mbproxy.Proxy;
/// <summary>
/// Direction of a Modbus PDU being processed by the pipeline. Passed to
/// <see cref="IPduPipeline.Process"/> so an implementation can tell requests from responses.
/// </summary>
public enum MbapDirection
{
/// <summary>A request frame travelling from an upstream client to the backend PLC.</summary>
RequestToBackend,
/// <summary>A response frame travelling from the backend PLC back to the upstream client.</summary>
ResponseToClient,
}
/// <summary>
/// Base context object handed to every PDU pipeline call. On its own it only identifies
/// the PLC by <see cref="PlcName"/>. <see cref="PerPlcContext"/> derives from it to add the
/// BCD tag map, counters, and logger (Phase 04) and the per-call <c>CurrentRequest</c> slot
/// used for multiplexer-aware response correlation (Phase 09).
/// </summary>
public class PduContext
{
    /// <summary>
    /// The configured PLC name (from <c>MbproxyOptions.Plcs[i].Name</c>).
    /// Defaults to the empty string when not set.
    /// </summary>
    public string PlcName { get; init; } = string.Empty;

    // Everything beyond the name lives on PerPlcContext.
}
/// <summary>
/// Hook contract for inspecting and rewriting Modbus PDU bytes inline.
/// Called once per frame in each direction (request and response).
///
/// Implementations must be safe to call concurrently from multiple connection pairs.
/// In Phase 03 the only implementation is <see cref="NoopPduPipeline"/> (pass-through).
/// Phase 04 replaces it with a BCD rewriter registered via DI.
/// </summary>
public interface IPduPipeline
{
/// <summary>
/// Processes a single Modbus PDU. Implementations may mutate <paramref name="pdu"/> in place;
/// the method returns nothing, so any rewrite is visible only through those bytes.
/// </summary>
/// <param name="direction">Whether this is a request or a response frame.</param>
/// <param name="mbapHeader">The 7-byte MBAP header (read-only; includes TxId, UnitId, FC is in pdu[0]).</param>
/// <param name="pdu">The PDU bytes starting at the function code. May be mutated in place.</param>
/// <param name="context">
/// Per-call context. The base type carries only the PLC name; implementations that need
/// more are expected to receive a derived context such as <see cref="PerPlcContext"/>.
/// </param>
void Process(MbapDirection direction, ReadOnlySpan<byte> mbapHeader, Span<byte> pdu, PduContext context);
}
+60
View File
@@ -0,0 +1,60 @@
namespace Mbproxy.Proxy;
/// <summary>
/// Stateless helpers for reading Modbus Application Protocol (MBAP) headers.
///
/// Frame layout (7-byte header followed by the PDU):
///   [0..1] TxId        (big-endian uint16)
///   [2..3] ProtocolId  (big-endian uint16; always 0 for standard Modbus)
///   [4..5] Length      (big-endian uint16; counts UnitId + PDU bytes)
///   [6]    UnitId
///   [7..]  PDU         (function code + data); (Length - 1) bytes long
///
/// So a whole frame occupies 6 + Length bytes, which is the same as
/// 7 (header) + (Length - 1) (PDU).
/// </summary>
internal static class MbapFrame
{
    /// <summary>Size of the MBAP header in bytes: TxId + ProtocolId + Length + UnitId.</summary>
    public const int HeaderSize = 7;

    /// <summary>Largest PDU body allowed by the Modbus specification (253 bytes).</summary>
    public const int MaxPduBodySize = 253;

    /// <summary>Per-pair buffer size: header (7) + largest PDU body (253) = 260 bytes.</summary>
    public const int BufferSize = HeaderSize + MaxPduBodySize;

    /// <summary>
    /// Reads the four header fields from the first 7 bytes of <paramref name="buffer"/>.
    /// </summary>
    /// <param name="buffer">Bytes starting at the first byte of the MBAP header.</param>
    /// <param name="txId">Transaction identifier; 0 on failure.</param>
    /// <param name="protocolId">Protocol identifier, returned as found; 0 on failure.</param>
    /// <param name="length">Length field, returned as found; 0 on failure.</param>
    /// <param name="unitId">Unit identifier; 0 on failure.</param>
    /// <returns>
    /// <c>false</c> when fewer than 7 bytes are available, otherwise <c>true</c>. No field is
    /// validated — checking <paramref name="protocolId"/> and <paramref name="length"/> is the
    /// caller's responsibility.
    /// </returns>
    public static bool TryParseHeader(
        ReadOnlySpan<byte> buffer,
        out ushort txId,
        out ushort protocolId,
        out ushort length,
        out byte unitId)
    {
        // Start from the failure values so the short-buffer path is a bare return.
        txId = 0;
        protocolId = 0;
        length = 0;
        unitId = 0;

        if (buffer.Length < HeaderSize)
        {
            return false;
        }

        txId = ReadBigEndian(buffer, 0);
        protocolId = ReadBigEndian(buffer, 2);
        length = ReadBigEndian(buffer, 4);
        unitId = buffer[6];
        return true;
    }

    /// <summary>
    /// Total frame size in bytes for a given MBAP length field: the 6 bytes that precede
    /// the length's coverage (TxId + ProtocolId + Length) plus the length field itself.
    /// </summary>
    public static int TotalFrameLength(ushort lengthField) => lengthField + 6;

    /// <summary>Reads a big-endian 16-bit value at <paramref name="offset"/>.</summary>
    private static ushort ReadBigEndian(ReadOnlySpan<byte> source, int offset)
        => (ushort)((source[offset] << 8) | source[offset + 1]);
}
@@ -0,0 +1,82 @@
using System.Collections.Concurrent;
namespace Mbproxy.Proxy.Multiplexing;
/// <summary>
/// Lookup table from proxy-assigned MBAP TxId to the <see cref="InFlightRequest"/> that is
/// waiting on it. Entries are added on the multiplexer's per-upstream <c>OnFrame</c> path
/// and removed by the backend reader task when the matching response arrives.
///
/// <para>Storage is a <see cref="ConcurrentDictionary{TKey, TValue}"/>. Phase 9's
/// single-writer / single-remover pattern does not strictly need it, but the
/// cascade-on-disconnect walk runs on a separate task and Phase 10 adds upstream-side
/// cancellation paths, so the safer primitive is worth its negligible cost.</para>
/// </summary>
internal sealed class CorrelationMap
{
    private readonly ConcurrentDictionary<ushort, InFlightRequest> _entries = new();

    /// <summary>Number of currently-in-flight requests.</summary>
    public int Count => _entries.Count;

    /// <summary>
    /// Stores <paramref name="req"/> under <paramref name="proxyTxId"/>.
    /// </summary>
    /// <returns>
    /// <c>false</c> when the key is already occupied. That is a programming error — the
    /// allocator must never hand out a TxId that is still in flight — and callers should
    /// treat it as a fatal contract violation and drop the upstream connection.
    /// </returns>
    public bool TryAdd(ushort proxyTxId, InFlightRequest req)
    {
        return _entries.TryAdd(proxyTxId, req);
    }

    /// <summary>
    /// Removes and returns the entry stored under <paramref name="proxyTxId"/>.
    /// </summary>
    /// <returns>
    /// <c>false</c> when nothing is stored under the key, which is normal during cascade
    /// cleanup and on stale-response paths. <paramref name="req"/> is only meaningful when
    /// the result is <c>true</c>.
    /// </returns>
    public bool TryRemove(ushort proxyTxId, out InFlightRequest req)
    {
        return _entries.TryRemove(proxyTxId, out req!);
    }

    /// <summary>
    /// Copies every in-flight request into a new array, detached from the live dictionary.
    /// Allocates; intended for diagnostics (cascade walk on backend disconnect; future
    /// drain-on-shutdown).
    /// </summary>
    public IReadOnlyCollection<InFlightRequest> Snapshot()
    {
        var values = _entries.Values;
        return values.ToArray();
    }

    /// <summary>
    /// Removes every entry it can and returns what it removed. Used by the multiplexer's
    /// cascade path when the backend socket dies, so it can close every interested upstream
    /// pipe and free every allocated proxy TxId.
    /// </summary>
    /// <remarks>
    /// An entry that another task removes between enumeration and the removal attempt is
    /// simply left out of the result.
    /// </remarks>
    public IReadOnlyList<KeyValuePair<ushort, InFlightRequest>> DrainAll()
    {
        var drained = new List<KeyValuePair<ushort, InFlightRequest>>(_entries.Count);

        foreach (var entry in _entries)
        {
            ushort txId = entry.Key;
            if (!_entries.TryRemove(txId, out var removed))
            {
                continue; // someone else got there first
            }

            drained.Add(KeyValuePair.Create(txId, removed));
        }

        return drained;
    }

    /// <summary>
    /// Lists the (proxyTxId, request) pairs whose <see cref="InFlightRequest.SentAtUtc"/> is
    /// at or before <paramref name="threshold"/>. Allocates; intended for the periodic
    /// per-request timeout watchdog only. Nothing is removed — the caller decides which
    /// entries to time out.
    /// </summary>
    public IReadOnlyList<KeyValuePair<ushort, InFlightRequest>> SnapshotOlderThan(DateTimeOffset threshold)
    {
        var stale = new List<KeyValuePair<ushort, InFlightRequest>>();

        foreach (var entry in _entries)
        {
            if (entry.Value.SentAtUtc > threshold)
            {
                continue; // still fresh
            }

            stale.Add(entry);
        }

        return stale;
    }
}
@@ -0,0 +1,41 @@
namespace Mbproxy.Proxy.Multiplexing;
/// <summary>
/// One upstream party interested in a single backend round-trip. Carries the upstream
/// pipe to deliver the response to AND the original MBAP TxId that the party sent — the
/// multiplexer must rewrite the response's MBAP TxId back to <see cref="OriginalTxId"/>
/// before handing the frame to the pipe, so each upstream sees the proxy as transparent.
///
/// <para><b>Phase 9 invariant:</b> exactly one <see cref="InterestedParty"/> per
/// <see cref="InFlightRequest"/>. <b>Phase 10 (read coalescing)</b> reuses this exact
/// shape to fan-out a single backend response to multiple upstream parties. Do not
/// collapse this into a single field on <see cref="InFlightRequest"/>.</para>
/// </summary>
/// <param name="Pipe">The upstream pipe the response is delivered to.</param>
/// <param name="OriginalTxId">The MBAP TxId this party sent, to be restored on the response.</param>
internal sealed record InterestedParty(UpstreamPipe Pipe, ushort OriginalTxId);
/// <summary>
/// Per-backend-request correlation record. Stored in <see cref="CorrelationMap"/> keyed
/// by the proxy-assigned TxId; looked up by the backend reader task to:
/// <list type="bullet">
/// <item><description>Restore each interested party's original MBAP TxId before forwarding
/// the response upstream (transparent multiplexing contract).</description></item>
/// <item><description>Provide the BCD rewriter with the originating request's
/// <c>StartAddress</c> / <c>Qty</c> for FC03/FC04 response decoding — the response
/// PDU itself does not carry the start address.</description></item>
/// <item><description>Measure backend round-trip time via <see cref="SentAtUtc"/>
/// (replaces the per-pair stopwatch slot from the 1:1 model).</description></item>
/// </list>
///
/// <para><b>Phase 9:</b> <see cref="InterestedParties"/> always has exactly one element.
/// The list shape is the load-bearing seam that <b>Phase 10 — read coalescing</b> hooks
/// into to fan out a single PLC response to multiple upstream clients without further
/// refactor of the multiplexer's data model. Reviewer note: do <i>not</i> simplify back
/// to a single <c>UpstreamPipe</c> field.</para>
/// </summary>
/// <param name="UnitId">Unit identifier of the request (presumably copied from its MBAP header — confirm against the multiplexer).</param>
/// <param name="Fc">Function code of the originating request; the BCD rewriter only decodes a response when this is 0x03 or 0x04.</param>
/// <param name="StartAddress">First register address of the originating request.</param>
/// <param name="Qty">Register quantity of the originating request.</param>
/// <param name="InterestedParties">Upstream parties that receive the response; exactly one in Phase 9.</param>
/// <param name="SentAtUtc">Timestamp compared against the watchdog threshold and used for round-trip timing.</param>
internal sealed record InFlightRequest(
byte UnitId,
byte Fc,
ushort StartAddress,
ushort Qty,
IReadOnlyList<InterestedParty> InterestedParties,
DateTimeOffset SentAtUtc);
@@ -0,0 +1,121 @@
namespace Mbproxy.Proxy.Multiplexing;
/// <summary>
/// Source-generated <see cref="LoggerMessage"/> definitions for the TxId-multiplexing
/// connection layer. Event names are stable — do not rename without updating
/// docs/design.md's "Logging" event-name table. Event ids in this class run from 110 to 116.
/// </summary>
internal static partial class MultiplexerLogEvents
{
/// <summary>
/// Emitted once per upstream client accept. Replaces the per-pair
/// <c>mbproxy.client.connected</c> event from the 1:1 model (same event name,
/// same property shape — operators' log queries are unchanged).
/// Event 110, Information.
/// </summary>
[LoggerMessage(
EventId = 110,
EventName = "mbproxy.client.connected",
Level = LogLevel.Information,
Message = "Client connected: Plc={Plc} RemoteEp={RemoteEp}")]
public static partial void ClientConnected(
ILogger logger,
string plc,
string remoteEp);
/// <summary>
/// Emitted when an upstream pipe is closed (clean disconnect, fault, or cascade).
/// The <c>Reason</c> property carries the caller-supplied cause. Event 111, Information.
/// </summary>
[LoggerMessage(
EventId = 111,
EventName = "mbproxy.client.disconnected",
Level = LogLevel.Information,
Message = "Client disconnected: Plc={Plc} RemoteEp={RemoteEp} Reason={Reason}")]
public static partial void ClientDisconnected(
ILogger logger,
string plc,
string remoteEp,
string reason);
/// <summary>
/// Emitted when the multiplexer successfully opens its single backend connection to a PLC.
/// Event 112, Information.
/// </summary>
[LoggerMessage(
EventId = 112,
EventName = "mbproxy.multiplex.backend.connected",
Level = LogLevel.Information,
Message = "Backend multiplex connection up: Plc={Plc} Host={Host} Port={Port}")]
public static partial void BackendConnected(
ILogger logger,
string plc,
string host,
int port);
/// <summary>
/// Emitted when the multiplexer cascades a backend disconnect to all attached upstream
/// clients. <c>UpstreamCount</c> is the number of upstream pipes that were closed and
/// <c>InFlightCount</c> is the number of in-flight requests dropped.
/// Event 113, Warning.
/// </summary>
[LoggerMessage(
EventId = 113,
EventName = "mbproxy.multiplex.backend.disconnected",
Level = LogLevel.Warning,
Message = "Backend multiplex connection down: Plc={Plc} UpstreamCount={UpstreamCount} InFlightCount={InFlightCount} Reason={Reason}")]
public static partial void BackendDisconnected(
ILogger logger,
string plc,
int upstreamCount,
int inFlightCount,
string reason);
/// <summary>
/// Emitted once when the TxId allocator refuses to allocate — every slot in the 16-bit
/// space is currently in flight. The multiplexer responds to the upstream with a
/// Modbus exception (code 04 / Slave Device Failure). Realistically unreachable under
/// normal load (ECOM serializes at ~2-10 ms per request); a stress-only path.
/// Event 114, Error.
/// </summary>
[LoggerMessage(
EventId = 114,
EventName = "mbproxy.multiplex.saturated",
Level = LogLevel.Error,
Message = "Multiplexer TxId space saturated — returning exception 04 to upstream: Plc={Plc} RemoteEp={RemoteEp}")]
public static partial void Saturated(
ILogger logger,
string plc,
string remoteEp);
/// <summary>
/// Emitted when the backend connect Polly pipeline fails. Mirrors the existing
/// <c>mbproxy.backend.failed</c> event from the 1:1 model so operators' alerts keep
/// working unchanged after Phase 9. Event 115, Warning.
/// </summary>
[LoggerMessage(
EventId = 115,
EventName = "mbproxy.backend.failed",
Level = LogLevel.Warning,
Message = "Backend connect failed: Plc={Plc} Reason={Reason}")]
public static partial void BackendFailed(
ILogger logger,
string plc,
string reason);
/// <summary>
/// Emitted when the per-request watchdog times out an in-flight request whose response
/// never arrived within <c>BackendRequestTimeoutMs</c>. The upstream party receives a
/// Modbus exception (code 0x0B / Gateway Target Device Failed To Respond) and the
/// proxy TxId is freed. Causes include: PLC dropped the response, network packet loss,
/// or a backend that echoes the wrong MBAP TxId (e.g. pymodbus 3.13.0's
/// concurrent-multiplexed-request bug). Event 116, Warning.
/// </summary>
[LoggerMessage(
EventId = 116,
EventName = "mbproxy.multiplex.request.timeout",
Level = LogLevel.Warning,
Message = "In-flight request timed out: Plc={Plc} ProxyTxId={ProxyTxId} OriginalTxId={OriginalTxId} Fc={Fc} ElapsedMs={ElapsedMs}")]
public static partial void RequestTimeout(
ILogger logger,
string plc,
ushort proxyTxId,
ushort originalTxId,
byte fc,
long elapsedMs);
}
@@ -0,0 +1,664 @@
using System.Collections.Concurrent;
using System.Diagnostics;
using System.Net.Sockets;
using System.Threading.Channels;
using Mbproxy.Options;
using Polly;
namespace Mbproxy.Proxy.Multiplexing;
/// <summary>
/// Owner of the single backend TCP connection to one PLC. Multiplexes many
/// <see cref="UpstreamPipe"/> instances onto that one socket by rewriting MBAP transaction
/// IDs so concurrent in-flight requests from different upstream clients remain
/// distinguishable on the shared wire. The multiplexer:
///
/// <list type="bullet">
/// <item><description>Opens and re-opens the backend socket through a Polly retry pipeline
/// that matches the <see cref="ResilienceOptions.BackendConnect"/> profile.</description></item>
/// <item><description>Runs one backend writer task that drains <see cref="_outboundChannel"/>
/// into the backend socket (single writer; no socket-level synchronisation needed).</description></item>
/// <item><description>Runs one backend reader task that decodes MBAP frames from the backend,
/// looks each frame up in the <see cref="CorrelationMap"/>, restores each interested
/// party's original TxId, and hands the frame to that party's
/// <see cref="UpstreamPipe._responseChannel"/>.</description></item>
/// <item><description>Cascades a backend disconnect by closing every attached pipe and
/// freeing every allocated proxy TxId, then waits for the next upstream request to
/// arrive (which triggers a fresh backend connect via Polly).</description></item>
/// </list>
///
/// <para><b>Threading invariants:</b> a single backend writer touches the backend socket
/// for sends; a single backend reader touches the same socket for receives. Per-upstream
/// read tasks call <see cref="OnUpstreamFrameAsync"/>, which allocates a proxy TxId, queues
/// the request frame into <see cref="_outboundChannel"/>, and returns. Upstream-side writes
/// flow through each pipe's response channel — never directly through this class.</para>
///
/// <para><b>Lifecycle:</b> the multiplexer is created with the backend offline. The first
/// <see cref="OnUpstreamFrameAsync"/> call triggers backend connect through the Polly
/// pipeline; <see cref="Attach"/> only registers the pipe and does not connect. Subsequent
/// in-flight requests reuse the same socket. <see cref="DisposeAsync"/> tears down the
/// backend socket, the writer/reader tasks, and every attached pipe.</para>
/// </summary>
internal sealed class PlcMultiplexer : IAsyncDisposable, IMultiplexCountersProvider
{
// Capacity of the bounded upstream → backend frame queue (_outboundChannel below).
private const int OutboundChannelCapacity = 256;
private readonly PlcOptions _plc;
private readonly ConnectionOptions _connectionOptions;
private readonly IPduPipeline _pipeline;
private readonly PerPlcContext _ctx;
private readonly ILogger<PlcMultiplexer> _logger;
// Optional retry pipeline for backend connects; when null a single direct connect is attempted.
private readonly ResiliencePipeline? _backendConnectPipeline;
// Hands out / reclaims the proxy TxIds written into outgoing MBAP headers.
private readonly TxIdAllocator _allocator = new();
// Maps each allocated proxy TxId to its InFlightRequest until a response, timeout or cascade removes it.
private readonly CorrelationMap _correlation = new();
// Request frames queued by OnUpstreamFrameAsync and drained by the backend writer task.
// FullMode.Wait: producers wait (rather than drop) when the queue is full.
private readonly Channel<byte[]> _outboundChannel = Channel.CreateBounded<byte[]>(
new BoundedChannelOptions(OutboundChannelCapacity)
{
FullMode = BoundedChannelFullMode.Wait,
SingleReader = true,
SingleWriter = false,
});
// Attached pipes — Phase 9 needs the list for the status page; Phase 10 will need it for
// coalescing (fan-out). ConcurrentDictionary keyed on UpstreamPipe.Id for O(1) detach.
private readonly ConcurrentDictionary<Guid, UpstreamPipe> _pipes = new();
// Lifecycle plumbing. Backend tasks share a CTS; cascading disconnect cancels it,
// which terminates both the writer and reader tasks. The next call to
// EnsureBackendConnectedAsync constructs a fresh CTS and a fresh backend socket.
// _backendLock guards the four backend fields below when they are swapped as a group.
private readonly object _backendLock = new();
private Socket? _backendSocket;
private CancellationTokenSource? _backendCts;
private Task? _backendWriterTask;
private Task? _backendReaderTask;
// Cancelled by DisposeAsync; drives the watchdog and is the parent of every backend CTS.
private readonly CancellationTokenSource _disposeCts = new();
// Set once by DisposeAsync.
// NOTE(review): plain bool read from several tasks without synchronisation — confirm
// that concurrent DisposeAsync calls cannot happen.
private bool _disposed;
// The per-request timeout watchdog started by the constructor; joined (best effort) on dispose.
private Task? _watchdogTask;
/// <summary>
/// Creates the multiplexer for one PLC. No backend socket is opened here; the
/// constructor only stores its collaborators, registers itself as the counters'
/// live telemetry provider, and starts the request-timeout watchdog.
/// </summary>
/// <param name="plc">Backend PLC settings (name, host, port).</param>
/// <param name="connectionOptions">Connect / request timeout settings.</param>
/// <param name="pipeline">PDU pipeline applied to every request and response.</param>
/// <param name="perPlcContext">Per-PLC context carrying the counters.</param>
/// <param name="logger">Logger for multiplexer events.</param>
/// <param name="backendConnectPipeline">Optional retry pipeline used for backend connects.</param>
public PlcMultiplexer(
    PlcOptions plc,
    ConnectionOptions connectionOptions,
    IPduPipeline pipeline,
    PerPlcContext perPlcContext,
    ILogger<PlcMultiplexer> logger,
    ResiliencePipeline? backendConnectPipeline = null)
{
    (_plc, _connectionOptions, _pipeline) = (plc, connectionOptions, pipeline);
    (_ctx, _logger, _backendConnectPipeline) = (perPlcContext, logger, backendConnectPipeline);

    // From now on status snapshots for this PLC read their multiplex numbers from us.
    _ctx.Counters.SetMultiplexProvider(this);

    // Start the per-request timeout watchdog. It periodically scans the correlation map
    // and times out any in-flight request older than BackendRequestTimeoutMs — the
    // safety net for lost responses, dead-PLC paths, and backends that mis-echo TxIds
    // (e.g. pymodbus 3.13.0's concurrent-multiplexed-request bug — see test files).
    _watchdogTask = Task.Run(
        () => RunRequestTimeoutWatchdogAsync(_disposeCts.Token),
        CancellationToken.None);
}
// ── IMultiplexCountersProvider ────────────────────────────────────────────
/// <summary>Number of proxy TxIds currently allocated, as reported by the allocator.</summary>
public long InFlightCount
{
    get { return _allocator.InFlightCount; }
}

/// <summary>The allocator's wrap counter.</summary>
public long TxIdWraps
{
    get { return _allocator.WrapCount; }
}

/// <summary>Number of request frames currently waiting in the outbound channel.</summary>
public long BackendQueueDepth
{
    get { return _outboundChannel.Reader.Count; }
}

// ── Public surface ────────────────────────────────────────────────────────
/// <summary>
/// Snapshot of the upstream pipes attached at the time of the call; a new array is
/// produced on every read. Used by the status page.
/// </summary>
public IReadOnlyCollection<UpstreamPipe> AttachedPipes
{
    get { return _pipes.Values.ToArray(); }
}
/// <summary>
/// Registers <paramref name="pipe"/> with this multiplexer under its <c>Id</c>
/// (re-attaching the same id overwrites the previous entry). Running the pipe's
/// read+write loops is the caller's job — typically via <see cref="StartPipeAsync"/>,
/// which routes the pipe's frames into <see cref="OnUpstreamFrameAsync"/>.
/// </summary>
/// <exception cref="ObjectDisposedException">The multiplexer has been disposed.</exception>
public void Attach(UpstreamPipe pipe)
{
    if (!_disposed)
    {
        _pipes[pipe.Id] = pipe;
        return;
    }

    throw new ObjectDisposedException(nameof(PlcMultiplexer));
}
/// <summary>
/// Attaches <paramref name="pipe"/>, launches its write loop in the background and
/// starts its read loop. The returned task is the read loop; once it finishes the
/// pipe is removed from the attached set (but not disposed).
/// </summary>
public Task StartPipeAsync(UpstreamPipe pipe, CancellationToken ct)
{
    Attach(pipe);

    // Fire-and-forget writer: it ends when the pipe is disposed or its response
    // channel completes, so nothing here awaits it.
    _ = Task.Run(() => pipe.RunWriteLoopAsync(ct), CancellationToken.None);

    Task reading = pipe.RunReadLoopAsync(ForwardAsync, ct);

    // Detach when reading stops. Disposal is left to the listener (or to the
    // cascade walker), which owns the pipe's lifetime.
    _ = reading.ContinueWith(finished =>
    {
        _pipes.TryRemove(pipe.Id, out _);
    }, TaskScheduler.Default);

    return reading;

    // Routes every frame read from this pipe into the multiplexer.
    ValueTask ForwardAsync(byte[] frame, CancellationToken frameCt)
        => OnUpstreamFrameAsync(pipe, frame, frameCt);
}
/// <summary>
/// Shuts the multiplexer down. In order: unregisters the counters provider, cancels
/// the dispose token, waits briefly for the watchdog, tears down the backend
/// (cascading to upstream pipes), completes the outbound channel, and disposes any
/// pipe still attached. A second call returns immediately.
/// </summary>
public async ValueTask DisposeAsync()
{
    if (_disposed)
    {
        return;
    }

    _disposed = true;

    // Unhook telemetry first so a status snapshot taken during teardown doesn't
    // observe internal state that is about to be emptied.
    _ctx.Counters.SetMultiplexProvider(null);
    await _disposeCts.CancelAsync().ConfigureAwait(false);

    // Give the watchdog up to two seconds to settle so its last log/dispatch is done
    // before tests assert on counter state. Failure to join is not an error.
    if (_watchdogTask is { } watchdog)
    {
        try
        {
            await watchdog.WaitAsync(TimeSpan.FromSeconds(2)).ConfigureAwait(false);
        }
        catch
        {
            /* swallow */
        }
    }

    await TearDownBackendAsync("disposing", cascadeUpstreams: true).ConfigureAwait(false);
    _outboundChannel.Writer.TryComplete();

    // TearDownBackendAsync returns early when no backend was ever connected, so any
    // pipe still attached is disposed here.
    foreach (UpstreamPipe remaining in _pipes.Values)
    {
        try
        {
            await remaining.DisposeAsync().ConfigureAwait(false);
        }
        catch
        {
            /* best effort */
        }
    }

    _pipes.Clear();
    _disposeCts.Dispose();
}
// ── Backend connect / teardown ────────────────────────────────────────────
/// <summary>
/// Makes sure a live backend connection exists, establishing one if necessary.
/// Concurrent callers are serialised through <c>_connectGate</c>; only the first one
/// connects, the rest observe the result on the re-check.
/// </summary>
/// <param name="ct">Cancels waiting for the gate and the connect attempt(s).</param>
/// <returns>
/// <c>true</c> when a backend socket is connected and its writer/reader tasks are
/// running; <c>false</c> when the multiplexer is disposed or the connect failed
/// (the failure is logged and counted).
/// </returns>
/// <exception cref="OperationCanceledException">
/// <paramref name="ct"/> was cancelled while waiting for the connect gate.
/// </exception>
private async Task<bool> EnsureBackendConnectedAsync(CancellationToken ct)
{
    if (_disposed) return false;

    // Fast path: already connected.
    if (IsBackendLive()) return true;

    // Serialise concurrent connect attempts from many upstream pipes.
    await _connectGate.WaitAsync(ct).ConfigureAwait(false);
    try
    {
        // Re-check after acquiring the gate: another caller may have connected.
        if (IsBackendLive()) return true;

        Socket? backend = null;
        try
        {
            if (_backendConnectPipeline is not null)
            {
                // Each retry attempt gets its own socket: a socket whose connect failed
                // or was cancelled by the timeout may be closed and must not be reused.
                await _backendConnectPipeline.ExecuteAsync(async attemptToken =>
                {
                    backend = await ConnectBackendSocketAsync(attemptToken).ConfigureAwait(false);
                }, ct).ConfigureAwait(false);
            }
            else
            {
                backend = await ConnectBackendSocketAsync(ct).ConfigureAwait(false);
            }
        }
        catch (Exception ex)
        {
            string reason = ex is OperationCanceledException
                ? $"Backend connect timed out or cancelled after {_connectionOptions.BackendConnectTimeoutMs} ms"
                : ex.Message;
            MultiplexerLogEvents.BackendFailed(_logger, _plc.Name, reason);
            _ctx.Counters.IncrementConnectFailed();
            return false;
        }

        if (backend is null)
        {
            // The pipeline completed without the connect callback producing a socket.
            MultiplexerLogEvents.BackendFailed(
                _logger, _plc.Name, "Backend connect pipeline completed without establishing a connection");
            _ctx.Counters.IncrementConnectFailed();
            return false;
        }

        Socket connected = backend;

        // The multiplexer may have been disposed while the connect was in progress;
        // don't wire up a backend nobody will tear down.
        if (_disposed)
        {
            connected.Dispose();
            return false;
        }

        // Successful connect. Wire up the backend tasks under a CTS that also fires on dispose.
        var backendCts = CancellationTokenSource.CreateLinkedTokenSource(_disposeCts.Token);
        lock (_backendLock)
        {
            _backendSocket = connected;
            _backendCts = backendCts;
            _backendWriterTask = Task.Run(() => RunBackendWriterAsync(connected, backendCts.Token), CancellationToken.None);
            _backendReaderTask = Task.Run(() => RunBackendReaderAsync(connected, backendCts.Token), CancellationToken.None);
        }
        _ctx.Counters.IncrementConnectSuccess();
        MultiplexerLogEvents.BackendConnected(_logger, _plc.Name, _plc.Host, _plc.Port);
        return true;
    }
    finally
    {
        _connectGate.Release();
    }
}

/// <summary>True when a backend socket is connected and its CTS has not been cancelled.</summary>
private bool IsBackendLive()
    => _backendSocket is { Connected: true } && _backendCts is { IsCancellationRequested: false };

/// <summary>
/// Performs one connect attempt on a brand-new socket, bounded by
/// <c>BackendConnectTimeoutMs</c>. The socket is disposed if the attempt fails.
/// </summary>
private async Task<Socket> ConnectBackendSocketAsync(CancellationToken ct)
{
    var socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp)
    { NoDelay = true };
    try
    {
        using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
        timeoutCts.CancelAfter(_connectionOptions.BackendConnectTimeoutMs);
        await socket.ConnectAsync(_plc.Host, _plc.Port, timeoutCts.Token).ConfigureAwait(false);
        return socket;
    }
    catch
    {
        socket.Dispose();
        throw;
    }
}
private readonly SemaphoreSlim _connectGate = new(1, 1);
/// <summary>
/// Detaches and closes the current backend connection, if there is one. Every
/// in-flight correlation entry is dropped and its proxy TxId released; when
/// <paramref name="cascadeUpstreams"/> is set, every attached upstream pipe is also
/// disposed. Returns immediately when no backend is currently installed.
/// </summary>
/// <param name="reason">Free-text reason included in the disconnect log event.</param>
/// <param name="cascadeUpstreams">Whether to dispose all attached upstream pipes.</param>
private async Task TearDownBackendAsync(string reason, bool cascadeUpstreams)
{
    // Atomically take ownership of the current backend state so that concurrent
    // callers (writer fault, reader EOF, dispose) tear it down only once.
    Socket? oldSocket;
    CancellationTokenSource? oldCts;
    Task? writer, reader;
    lock (_backendLock)
    {
        oldSocket = _backendSocket;
        oldCts = _backendCts;
        writer = _backendWriterTask;
        reader = _backendReaderTask;
        _backendSocket = null;
        _backendCts = null;
        _backendWriterTask = null;
        _backendReaderTask = null;
    }
    if (oldSocket is null && oldCts is null) return;

    // Cancel first so the backend tasks see a cancellation rather than a socket error.
    try { oldCts?.Cancel(); } catch { /* best effort */ }
    try { oldSocket?.Shutdown(SocketShutdown.Both); } catch { /* already closed */ }
    try { oldSocket?.Dispose(); } catch { /* best effort */ }

    // Drop every in-flight request and free its proxy TxId.
    // NOTE(review): frames already queued in _outboundChannel are not drained here, so
    // the next connection's writer will still send them; confirm that is acceptable.
    var dropped = _correlation.DrainAll();
    foreach (var kvp in dropped)
        _allocator.Release(kvp.Key);

    int upstreamCount = 0;
    if (cascadeUpstreams)
    {
        // Per the design doc, ALL attached upstreams cascade on backend disconnect —
        // not only those with a request in flight. Work from a snapshot and remove
        // exactly the pipes we dispose, so a pipe attached concurrently is neither
        // silently dropped from tracking nor left undisposed.
        var pipeList = _pipes.Values.ToArray();
        upstreamCount = pipeList.Length;
        foreach (var pipe in pipeList)
        {
            _pipes.TryRemove(pipe.Id, out _);
            try { await pipe.DisposeAsync().ConfigureAwait(false); }
            catch { /* best effort */ }
        }
        _ctx.Counters.AddDisconnectCascades(upstreamCount);
    }

    // Best-effort join of the backend tasks (bounded, so teardown never hangs).
    try { if (writer is not null) await writer.WaitAsync(TimeSpan.FromSeconds(2)).ConfigureAwait(false); } catch { /* swallow */ }
    try { if (reader is not null) await reader.WaitAsync(TimeSpan.FromSeconds(2)).ConfigureAwait(false); } catch { /* swallow */ }
    oldCts?.Dispose();

    if (upstreamCount > 0 || dropped.Count > 0)
        MultiplexerLogEvents.BackendDisconnected(_logger, _plc.Name, upstreamCount, dropped.Count, reason);
}
// ── Backend writer / reader tasks ─────────────────────────────────────────
/// <summary>
/// Backend writer loop: drains <c>_outboundChannel</c> and writes each frame in full
/// to <paramref name="backend"/>. A genuine send failure triggers a cascading backend
/// teardown; cancellation (or any fault observed after cancellation) is treated as
/// normal shutdown.
/// </summary>
/// <param name="backend">The backend socket this writer owns for sends.</param>
/// <param name="ct">Token of the backend connection this writer belongs to.</param>
private async Task RunBackendWriterAsync(Socket backend, CancellationToken ct)
{
    try
    {
        await foreach (var frame in _outboundChannel.Reader.ReadAllAsync(ct).ConfigureAwait(false))
        {
            // SendAsync may accept fewer bytes than offered; loop until the frame is out.
            int sent = 0;
            while (sent < frame.Length)
            {
                int n = await backend.SendAsync(
                    frame.AsMemory(sent, frame.Length - sent),
                    SocketFlags.None,
                    ct).ConfigureAwait(false);
                if (n == 0) throw new SocketException((int)SocketError.ConnectionReset);
                sent += n;
            }
        }
    }
    catch (OperationCanceledException)
    {
        // Normal teardown.
    }
    catch (Exception) when (ct.IsCancellationRequested)
    {
        // This connection has already been torn down (its socket was shut down and
        // disposed after the CTS was cancelled). Don't call TearDownBackendAsync again:
        // it would act on whatever connection is current, possibly a fresh one.
    }
    catch (Exception ex)
    {
        // Backend failure — cascade.
        _ = TearDownBackendAsync($"writer fault: {ex.Message}", cascadeUpstreams: true);
    }
}
/// <summary>
/// Backend reader loop: reads MBAP frames from <paramref name="backend"/>, matches each
/// one to its in-flight request by proxy TxId, runs the response through the PDU
/// pipeline, restores each interested party's original TxId and hands the frame to
/// that party's pipe. EOF, a malformed header, an oversized frame or a socket fault
/// triggers a cascading backend teardown — unless this connection has already been
/// cancelled, in which case the loop just exits.
/// </summary>
/// <param name="backend">The backend socket this reader owns for receives.</param>
/// <param name="ct">Token of the backend connection this reader belongs to.</param>
private async Task RunBackendReaderAsync(Socket backend, CancellationToken ct)
{
    byte[] headerBuf = new byte[MbapFrame.HeaderSize];
    try
    {
        while (!ct.IsCancellationRequested)
        {
            if (!await FillAsync(backend, headerBuf, 0, MbapFrame.HeaderSize, ct).ConfigureAwait(false))
                break;
            if (!MbapFrame.TryParseHeader(headerBuf.AsSpan(),
                out ushort proxyTxId, out _, out ushort length, out _))
                break;
            if (length < 1)
            {
                // Degenerate frame — drop.
                continue;
            }

            // The MBAP length field counts the unit id byte plus the PDU.
            int pduBodyLen = length - 1;
            if (pduBodyLen > MbapFrame.MaxPduBodySize)
            {
                // Frame too large — backend is misbehaving; force teardown.
                _logger.LogWarning(
                    "Oversized backend frame: Plc={Plc} PduBody={Body} > Max={Max}",
                    _plc.Name, pduBodyLen, MbapFrame.MaxPduBodySize);
                break;
            }

            byte[] frame = new byte[MbapFrame.HeaderSize + pduBodyLen];
            Buffer.BlockCopy(headerBuf, 0, frame, 0, MbapFrame.HeaderSize);
            if (!await FillAsync(backend, frame, MbapFrame.HeaderSize, pduBodyLen, ct).ConfigureAwait(false))
                break;

            if (!_correlation.TryRemove(proxyTxId, out var inFlight))
            {
                // No correlation entry — either a stale response after cascade/timeout,
                // or the PLC sent something unsolicited. Drop the frame.
                continue;
            }

            // Free the allocator slot immediately so it can be reused.
            _allocator.Release(proxyTxId);

            // Update the round-trip EWMA. UpdateRoundTripEwma expects Stopwatch ticks,
            // but the send time is wall-clock, so convert the elapsed wall-clock time.
            TimeSpan roundTrip = DateTimeOffset.UtcNow - inFlight.SentAtUtc;
            long ticks = (long)(roundTrip.TotalSeconds * Stopwatch.Frequency);
            if (ticks > 0)
                _ctx.Counters.UpdateRoundTripEwma(ticks);

            // Apply the BCD rewriter on the response. Build a per-call context clone
            // that carries CurrentRequest so the rewriter can decode FC03/04 slots.
            var responseCtx = _ctx.WithCurrentRequest(inFlight);
            _pipeline.Process(
                MbapDirection.ResponseToClient,
                frame.AsSpan(0, MbapFrame.HeaderSize),
                frame.AsSpan(MbapFrame.HeaderSize, pduBodyLen),
                responseCtx);

            // Fan out to each interested party with their original TxId restored.
            // Phase 9: always exactly one party. Phase 10: N parties (read coalescing).
            foreach (var party in inFlight.InterestedParties)
            {
                if (!party.Pipe.IsAlive)
                    continue;

                // With a single party the freshly-read buffer is handed over as-is;
                // with several parties (Phase 10) each gets its own clone so that
                // patching one party's TxId cannot affect another's frame.
                byte[] outFrame = inFlight.InterestedParties.Count == 1
                    ? frame
                    : (byte[])frame.Clone();
                outFrame[0] = (byte)(party.OriginalTxId >> 8);
                outFrame[1] = (byte)(party.OriginalTxId & 0xFF);
                await party.Pipe.SendResponseAsync(outFrame, ct).ConfigureAwait(false);
            }
        }

        // The loop ended without an exception. If this connection was cancelled, the
        // teardown is already under way elsewhere; calling TearDownBackendAsync again
        // would act on whatever connection is current, possibly a fresh one.
        // Otherwise the backend closed or misbehaved: cascade.
        if (!ct.IsCancellationRequested)
            _ = TearDownBackendAsync("backend reader EOF", cascadeUpstreams: true);
    }
    catch (OperationCanceledException)
    {
        // Normal teardown.
    }
    catch (Exception) when (ct.IsCancellationRequested)
    {
        // Fault caused by the socket being shut down/disposed during teardown — ignore.
    }
    catch (Exception ex)
    {
        _ = TearDownBackendAsync($"reader fault: {ex.Message}", cascadeUpstreams: true);
    }
}
// ── Upstream → multiplexer entry point ────────────────────────────────────
/// <summary>
/// Handles one request frame read from <paramref name="pipe"/>: ensures the backend is
/// connected, allocates a proxy TxId, records the in-flight request, runs the request
/// through the PDU pipeline, rewrites the MBAP TxId and queues the frame for the
/// backend writer. Returns as soon as the frame is queued — it does not wait for the
/// response.
/// </summary>
/// <param name="pipe">The upstream pipe the frame came from.</param>
/// <param name="frame">A complete MBAP frame; it is modified in place and retained.</param>
/// <param name="ct">Cancels the connect wait and the enqueue.</param>
/// <remarks>
/// Frames without at least one PDU byte (function code) are dropped. When the backend
/// cannot be connected the pipe is disposed. When no proxy TxId is free the client
/// receives Modbus exception 04.
/// </remarks>
private async ValueTask OnUpstreamFrameAsync(UpstreamPipe pipe, byte[] frame, CancellationToken ct)
{
    if (_disposed) return;

    // Ensure backend is connected. Failure here means we cannot service the request;
    // close the upstream pipe (consistent with the 1:1 model's behaviour on connect
    // failure).
    if (!await EnsureBackendConnectedAsync(ct).ConfigureAwait(false))
    {
        try { await pipe.DisposeAsync().ConfigureAwait(false); } catch { /* best effort */ }
        return;
    }

    // A usable request needs the full header plus at least the function-code byte.
    // Rejecting header-only frames here — before a proxy TxId is allocated — avoids
    // indexing past the buffer below and leaking the allocated TxId.
    if (frame.Length <= MbapFrame.HeaderSize)
        return;
    if (!MbapFrame.TryParseHeader(frame.AsSpan(0, MbapFrame.HeaderSize),
        out ushort originalTxId, out _, out _, out byte unitId))
        return;

    int pduOffset = MbapFrame.HeaderSize;
    byte fcByte = frame[pduOffset];

    if (!_allocator.TryAllocate(out ushort proxyTxId))
    {
        MultiplexerLogEvents.Saturated(_logger, _plc.Name, pipe.RemoteEp?.ToString() ?? "?");
        // Synthesize Modbus exception 04 (Slave Device Failure).
        byte[] excFrame = BuildExceptionFrame(originalTxId, unitId, fcByte, exceptionCode: 4);
        await pipe.SendResponseAsync(excFrame, ct).ConfigureAwait(false);
        return;
    }

    // Parse start/qty (for FC03/04) so the response decoder has the correlation it needs.
    ushort startAddr = 0;
    ushort qty = 0;
    if (fcByte is 0x03 or 0x04 && frame.Length >= pduOffset + 5)
    {
        startAddr = (ushort)((frame[pduOffset + 1] << 8) | frame[pduOffset + 2]);
        qty = (ushort)((frame[pduOffset + 3] << 8) | frame[pduOffset + 4]);
    }

    var inFlight = new InFlightRequest(
        UnitId: unitId,
        Fc: fcByte,
        StartAddress: startAddr,
        Qty: qty,
        InterestedParties: [new InterestedParty(pipe, originalTxId)],
        SentAtUtc: DateTimeOffset.UtcNow);
    if (!_correlation.TryAdd(proxyTxId, inFlight))
    {
        // Should be impossible: the allocator just guaranteed proxyTxId is free.
        _allocator.Release(proxyTxId);
        _logger.LogError("CorrelationMap.TryAdd failed for already-free proxyTxId {ProxyTxId}", proxyTxId);
        return;
    }

    // Peak in-flight tracking.
    _ctx.Counters.ObserveInFlight(_allocator.InFlightCount);

    // Apply the BCD rewriter on the request. Use a per-call context with CurrentRequest
    // (the rewriter doesn't currently need it on request, but Phase 10 may).
    var requestCtx = _ctx.WithCurrentRequest(inFlight);
    _pipeline.Process(
        MbapDirection.RequestToBackend,
        frame.AsSpan(0, MbapFrame.HeaderSize),
        frame.AsSpan(MbapFrame.HeaderSize, frame.Length - MbapFrame.HeaderSize),
        requestCtx);

    // Overwrite the MBAP TxId with the proxy TxId.
    frame[0] = (byte)(proxyTxId >> 8);
    frame[1] = (byte)(proxyTxId & 0xFF);

    // Enqueue for the backend writer task.
    try
    {
        await _outboundChannel.Writer.WriteAsync(frame, ct).ConfigureAwait(false);
    }
    catch (ChannelClosedException)
    {
        // Channel completed during shutdown — release the proxy TxId.
        if (_correlation.TryRemove(proxyTxId, out _))
            _allocator.Release(proxyTxId);
    }
    catch (OperationCanceledException)
    {
        // The frame was never queued, so no response can arrive: free the slot now
        // instead of holding it until the watchdog times it out, then let the
        // cancellation propagate to the caller as before.
        if (_correlation.TryRemove(proxyTxId, out _))
            _allocator.Release(proxyTxId);
        throw;
    }
}
// ── Per-request timeout watchdog ──────────────────────────────────────────
/// <summary>
/// Periodically scans the correlation map for in-flight requests whose response has
/// not arrived within <see cref="ConnectionOptions.BackendRequestTimeoutMs"/>. For each
/// stale entry: removes it from the map, frees its allocator slot, and delivers a
/// Modbus exception (code 0x0B / Gateway Target Device Failed To Respond) to each
/// interested party with the original TxId restored.
///
/// <para><b>Why this exists.</b> In the 1:1 connection model, a lost response would
/// fault the dedicated backend socket and the upstream pair would close. The multiplexed
/// model needs an explicit per-request timer because a single missing or mis-routed
/// response would otherwise leak a correlation entry forever and hang the upstream
/// pipe indefinitely. Real-world causes: PLC drops a response, network packet loss,
/// backend that mis-echoes MBAP TxIds.</para>
/// </summary>
/// <param name="ct">Stops the watchdog when cancelled; the constructor passes the dispose token.</param>
private async Task RunRequestTimeoutWatchdogAsync(CancellationToken ct)
{
// Tick at roughly a quarter of the request timeout for responsive cleanup, with a
// 100 ms floor so the watchdog doesn't busy-wake on very small timeouts.
int tickMs = Math.Max(100, _connectionOptions.BackendRequestTimeoutMs / 4);
try
{
while (!ct.IsCancellationRequested)
{
await Task.Delay(tickMs, ct).ConfigureAwait(false);
// Cut-off instant: now minus the configured request timeout.
var threshold = DateTimeOffset.UtcNow.AddMilliseconds(-_connectionOptions.BackendRequestTimeoutMs);
var stale = _correlation.SnapshotOlderThan(threshold);
if (stale.Count == 0) continue;
foreach (var kvp in stale)
{
ushort proxyTxId = kvp.Key;
// Try to claim the entry; if another path (response, cascade) already removed it,
// skip — no work to do.
if (!_correlation.TryRemove(proxyTxId, out var req))
continue;
_allocator.Release(proxyTxId);
// Age of the request at the time it is declared timed out; used for logging only.
long elapsedMs = (long)(DateTimeOffset.UtcNow - req.SentAtUtc).TotalMilliseconds;
foreach (var party in req.InterestedParties)
{
// The timeout is logged for every party, including ones whose pipe is already gone.
MultiplexerLogEvents.RequestTimeout(
_logger, _plc.Name, proxyTxId, party.OriginalTxId, req.Fc, elapsedMs);
if (!party.Pipe.IsAlive)
continue;
// Deliver Modbus exception 0x0B (Gateway Target Device Failed To Respond)
// to the upstream client. This lets the client's library raise a clean
// ModbusException rather than hanging on a timeout.
byte[] excFrame = BuildExceptionFrame(party.OriginalTxId, req.UnitId, req.Fc, exceptionCode: 0x0B);
try
{
await party.Pipe.SendResponseAsync(excFrame, ct).ConfigureAwait(false);
}
catch
{
// Best-effort delivery; if the pipe is going down, the client
// discovers the failure through its own socket close path.
}
}
}
}
}
catch (OperationCanceledException)
{
// Normal teardown.
}
catch (Exception ex)
{
// Any other exception ends the watchdog; it is logged but not restarted here.
_logger.LogError(ex, "Request-timeout watchdog faulted: Plc={Plc}", _plc.Name);
}
}
// ── Helpers ───────────────────────────────────────────────────────────────
/// <summary>
/// Receives exactly <paramref name="count"/> bytes from <paramref name="socket"/> into
/// <paramref name="buf"/> starting at <paramref name="offset"/>.
/// </summary>
/// <returns>
/// <c>true</c> once all bytes have been received; <c>false</c> if the socket reported
/// end-of-stream (a zero-byte receive) before that.
/// </returns>
private static async Task<bool> FillAsync(
    Socket socket, byte[] buf, int offset, int count, CancellationToken ct)
{
    int filled = 0;
    while (filled < count)
    {
        Memory<byte> window = buf.AsMemory(offset + filled, count - filled);
        int got = await socket.ReceiveAsync(window, SocketFlags.None, ct).ConfigureAwait(false);
        if (got == 0)
        {
            return false;
        }

        filled += got;
    }

    return true;
}
/// <summary>
/// Builds a complete MBAP frame carrying a Modbus exception response.
/// </summary>
/// <param name="originalTxId">Transaction id the upstream client used; written big-endian.</param>
/// <param name="unitId">Unit id to echo back.</param>
/// <param name="fc">Function code of the failed request; returned with bit 0x80 set.</param>
/// <param name="exceptionCode">Modbus exception code to report.</param>
/// <returns>The header followed by the two-byte exception PDU.</returns>
private static byte[] BuildExceptionFrame(ushort originalTxId, byte unitId, byte fc, byte exceptionCode)
{
    // Exception PDU = [fc | 0x80][exceptionCode]; the MBAP length field therefore
    // covers UnitId (1) + PDU (2) = 3 bytes.
    byte[] exc = new byte[MbapFrame.HeaderSize + 2];
    int at = 0;
    exc[at++] = (byte)(originalTxId >> 8);   // TxId high
    exc[at++] = (byte)(originalTxId & 0xFF); // TxId low
    exc[at++] = 0;                           // ProtocolId high
    exc[at++] = 0;                           // ProtocolId low
    exc[at++] = 0;                           // Length high
    exc[at++] = 3;                           // Length low: UnitId(1) + ExFc(1) + ExCode(1)
    exc[at++] = unitId;
    exc[at++] = (byte)(fc | 0x80);
    exc[at] = exceptionCode;
    return exc;
}
}
@@ -0,0 +1,142 @@
namespace Mbproxy.Proxy.Multiplexing;
/// <summary>
/// Allocates 16-bit MBAP transaction IDs (proxy TxIds) used to multiplex many upstream
/// clients onto a single shared backend connection per PLC. The allocator tracks which
/// IDs are currently in flight and scans forward from a rolling cursor to find the next
/// free slot, mimicking the natural cadence of Modbus clients while keeping reuse
/// distance maximally large in steady state.
///
/// <para>State is protected by a single <see cref="object"/> lock. Contention is
/// negligible in practice — the allocator is per-PLC and one PLC's wire rate is bounded
/// by the controller's internal scan time (a few ms per request on an H2-ECOM100).
/// The lock is preferred over a lock-free approach for readability and worst-case
/// determinism (Polly retries, cascade cleanup, and saturation paths must not race).</para>
///
/// <para><b>Memory:</b> <c>bool[65536]</c> (~64 KB) per PLC. With ~54 PLCs that is
/// ~3.4 MB total — well within budget for a service that already ships at ~30 MB working
/// set under load.</para>
///
/// <para><b>Wrap counter:</b> increments every time the rolling cursor rolls over
/// 0xFFFF → 0x0000 during a successful allocation scan — whether slot 0xFFFF itself was
/// allocated or was skipped because it was in use. Frequent wraps indicate either
/// very high churn or extreme in-flight depth and are surfaced as a telemetry signal,
/// not an error.</para>
/// </summary>
internal sealed class TxIdAllocator
{
    // 65,536 slots total — the full uint16 space.
    private const int SlotCount = 65536;

    private readonly object _lock = new();
    private readonly bool[] _inUse = new bool[SlotCount];
    private ushort _next;            // rolling cursor; 0 on construction
    private int _inFlightCount;      // 0..65536
    private long _wrapCount;         // monotonic; never resets

    /// <summary>
    /// Number of currently-in-flight proxy TxIds (i.e., allocated but not yet released).
    /// Read under the same lock that mutates it; the snapshot is a simple atomic read of
    /// an int but we still hold the lock for cross-field consistency with <c>_inUse</c>.
    /// </summary>
    public int InFlightCount
    {
        get
        {
            lock (_lock)
            {
                return _inFlightCount;
            }
        }
    }

    /// <summary>
    /// Number of times the rolling cursor has wrapped 0xFFFF → 0x0000 during a
    /// successful allocation since the allocator was constructed. Read without locking
    /// via <see cref="Interlocked.Read"/> for the hot status-page path.
    /// </summary>
    public long WrapCount => Interlocked.Read(ref _wrapCount);

    /// <summary>
    /// Attempts to allocate the next free proxy TxId.
    /// Returns <c>true</c> with <paramref name="id"/> set when an ID was allocated.
    /// Returns <c>false</c> when every slot in the 16-bit space is currently in use;
    /// the caller is responsible for emitting <c>mbproxy.multiplex.saturated</c> and
    /// returning a Modbus exception (code 04 / Slave Device Failure) to the upstream.
    /// </summary>
    public bool TryAllocate(out ushort id)
    {
        lock (_lock)
        {
            if (_inFlightCount >= SlotCount)
            {
                id = 0;
                return false;
            }

            // Scan forward from _next for the next free slot. _inFlightCount < SlotCount
            // guarantees at least one free slot, so the loop terminates within at most
            // SlotCount iterations even in the pathological full-minus-one case.
            ushort start = _next;
            ushort cursor = start;
            bool rolledOver = false;
            do
            {
                // Stepping past 0xFFFF is the roll-over, regardless of whether that slot
                // ends up allocated or skipped. Slot 0xFFFF is visited at most once per
                // scan, so a single scan records at most one wrap.
                ushort following = unchecked((ushort)(cursor + 1));
                if (following == 0)
                    rolledOver = true;

                if (!_inUse[cursor])
                {
                    _inUse[cursor] = true;
                    _inFlightCount++;
                    _next = following;
                    if (rolledOver)
                        Interlocked.Increment(ref _wrapCount);
                    id = cursor;
                    return true;
                }

                cursor = following;
            }
            while (cursor != start);

            // Defensive: should be unreachable given the InFlightCount check above.
            id = 0;
            return false;
        }
    }

    /// <summary>
    /// Releases a previously-allocated proxy TxId. Releasing an ID that is not currently
    /// allocated is a no-op (defensive: cascade-on-disconnect can call <see cref="Release"/>
    /// after a concurrent timeout path has already done so).
    /// </summary>
    public void Release(ushort id)
    {
        lock (_lock)
        {
            if (_inUse[id])
            {
                _inUse[id] = false;
                _inFlightCount--;
            }
        }
    }

    /// <summary>
    /// Test-only: returns whether the given proxy TxId is currently marked in use.
    /// Internal so it remains usable from unit tests via InternalsVisibleTo.
    /// </summary>
    internal bool IsAllocated(ushort id)
    {
        lock (_lock)
        {
            return _inUse[id];
        }
    }
}
@@ -0,0 +1,281 @@
using System.Net;
using System.Net.Sockets;
using System.Threading.Channels;
namespace Mbproxy.Proxy.Multiplexing;
/// <summary>
/// One accepted upstream client socket, exposed as an asynchronous frame pipe to the
/// owning <see cref="PlcMultiplexer"/>. The pipe reads complete MBAP frames from the
/// upstream socket and hands each frame to a multiplexer-supplied <c>onFrame</c> callback;
/// it also exposes a write channel that the multiplexer drains to send response frames
/// back to the upstream client.
///
/// <para><b>Lifecycle:</b> constructed by <see cref="PlcListener"/> on accept; attached
/// to the multiplexer; runs its read loop until the upstream socket closes, the pipe is
/// disposed, or the multiplexer cascades a backend disconnect.</para>
///
/// <para><b>Concurrency model:</b> each pipe runs exactly two tasks — a read task and a
/// write task. The read task drives the multiplexer (one frame at a time, which preserves
/// the per-upstream-client one-in-flight invariant); the write task drains
/// <see cref="_responseChannel"/> and writes each frame to the socket. No third task ever
/// touches the socket.</para>
///
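/// <para><b>One-in-flight-per-upstream:</b> the read loop processes frames sequentially —
/// it awaits <c>onFrame</c> for one frame before reading the next — while the multiplexer
/// can have multiple distinct <c>onFrame</c> calls outstanding from <i>different</i>
/// upstream pipes. Note that the multiplexer's callback returns once the request has been
/// queued for the backend, not when its response arrives, so a client that pipelines
/// several PDUs on one connection can have more than one request in flight; the
/// one-in-flight property therefore relies on the client waiting for each response.</para>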
/// </summary>
internal sealed partial class UpstreamPipe : IAsyncDisposable
{
// Capacity 16: enough to buffer responses while the upstream's TCP send buffer drains,
// small enough that backpressure kicks in on a wedged consumer. Drop-on-fault behaviour
// applies — once the pipe is no longer alive (IsAlive is false), SendResponseAsync
// returns without enqueuing, so those responses never enter the channel.
private const int ResponseChannelCapacity = 16;
private readonly Socket _upstream;
private readonly ILogger _logger;
private readonly string _plcName;
// Response frames queued by SendResponseAsync and drained by RunWriteLoopAsync.
private readonly Channel<byte[]> _responseChannel = Channel.CreateBounded<byte[]>(
new BoundedChannelOptions(ResponseChannelCapacity)
{
FullMode = BoundedChannelFullMode.Wait, // backpressure, not drop
SingleReader = true,
SingleWriter = false, // multiplexer adds; potential future paths too
});
// Internal CTS lets the multiplexer signal "drop this pipe now" without waiting for
// the upstream socket to close cleanly. It is linked into both loops' tokens and is
// cancelled, then disposed, by DisposeAsync.
private readonly CancellationTokenSource _cts = new();
// Set once by DisposeAsync.
// NOTE(review): plain bool without synchronisation — confirm DisposeAsync cannot be
// entered concurrently (e.g. listener and cascade at the same time).
private bool _disposed;
// Phase 9: per-pipe forwarded-PDU counter (replaces the per-pair counter from the
// 1:1 model). Read by the status page.
private long _pdusForwardedCount;
/// <summary>Stable identity for status-page reporting and cascade cleanup.</summary>
public Guid Id { get; } = Guid.NewGuid();
/// <summary>The upstream client's remote endpoint, captured at construction.</summary>
public IPEndPoint? RemoteEp { get; }
/// <summary>UTC time at which this pipe object was created.</summary>
public DateTimeOffset ConnectedAtUtc { get; } = DateTimeOffset.UtcNow;
/// <summary>
/// Number of request PDUs read from this upstream and forwarded into the multiplexer.
/// Incremented by <see cref="RunReadLoopAsync"/> once per frame it hands to the callback.
/// </summary>
public long PdusForwardedCount => Interlocked.Read(ref _pdusForwardedCount);
/// <summary>
/// <c>true</c> until the pipe is disposed or its internal CTS is cancelled.
/// NOTE(review): in the code visible here only <see cref="DisposeAsync"/> cancels the
/// CTS, so a fault in the read or write loop does not by itself flip this to
/// <c>false</c> — confirm against the other parts of this partial class.
/// </summary>
public bool IsAlive => !_disposed && !_cts.IsCancellationRequested;
/// <summary>
/// Wraps an accepted upstream socket. Disables Nagle on the socket, captures its
/// remote endpoint (when it is an IP endpoint) and logs the client-connected event.
/// </summary>
/// <param name="upstream">The accepted client socket; owned by the pipe from here on.</param>
/// <param name="plcName">Name of the PLC this pipe is bound to (used in log events).</param>
/// <param name="logger">Logger for pipe events.</param>
public UpstreamPipe(Socket upstream, string plcName, ILogger logger)
{
    (_upstream, _plcName, _logger) = (upstream, plcName, logger);
    _upstream.NoDelay = true;
    RemoteEp = upstream.RemoteEndPoint as IPEndPoint;

    MultiplexerLogEvents.ClientConnected(_logger, _plcName, RemoteEp?.ToString() ?? "?");
}
/// <summary>
/// Runs the read side of the pipe. Reads complete MBAP frames from the upstream
/// socket and invokes <paramref name="onFrame"/> for each, one at a time. Returns when:
/// <list type="bullet">
/// <item><description>The upstream closes (end-of-stream while reading a frame).</description></item>
/// <item><description>The header cannot be parsed or announces an oversized PDU.</description></item>
/// <item><description>The pipe is disposed or <paramref name="ct"/> is cancelled.</description></item>
/// </list>
/// An exception thrown by <paramref name="onFrame"/> that is not a cancellation,
/// socket or object-disposed exception propagates to the caller.
///
/// <para>Each frame is handed over in a freshly allocated <see cref="byte"/>[] that the
/// callback may retain and modify; only the 7-byte header scratch buffer is reused.</para>
/// </summary>
/// <param name="onFrame">Callback receiving each complete frame.</param>
/// <param name="ct">External cancellation (combined with the pipe's own CTS).</param>
public async Task RunReadLoopAsync(
    Func<byte[], CancellationToken, ValueTask> onFrame,
    CancellationToken ct)
{
    // 7-byte header + max 253-byte PDU body = 260 bytes per frame.
    byte[] headerBuf = new byte[MbapFrame.HeaderSize];
    try
    {
        // Created inside the try on purpose: _cts.Token throws ObjectDisposedException
        // once DisposeAsync has disposed the CTS, and a pipe disposed before its loop
        // starts must end quietly rather than fault the task.
        using var linked = CancellationTokenSource.CreateLinkedTokenSource(ct, _cts.Token);
        var token = linked.Token;

        while (!token.IsCancellationRequested)
        {
            // Read the 7-byte MBAP header.
            if (!await FillAsync(_upstream, headerBuf, 0, MbapFrame.HeaderSize, token).ConfigureAwait(false))
                return; // EOF — upstream went away.
            if (!MbapFrame.TryParseHeader(headerBuf.AsSpan(),
                out _, out _, out ushort length, out _))
                return;

            if (length < 1)
            {
                // Length field claims no body — forward the header alone via a fresh buffer.
                byte[] degenerate = new byte[MbapFrame.HeaderSize];
                Buffer.BlockCopy(headerBuf, 0, degenerate, 0, MbapFrame.HeaderSize);
                Interlocked.Increment(ref _pdusForwardedCount);
                await onFrame(degenerate, token).ConfigureAwait(false);
                continue;
            }

            // The MBAP length field counts the unit id byte plus the PDU.
            int pduBodyLen = length - 1;
            if (pduBodyLen > MbapFrame.MaxPduBodySize)
            {
                // Frame too large for the buffer — close the upstream.
                _logger.LogWarning(
                    "Oversized upstream frame: Plc={Plc} PduBody={Body} > Max={Max}",
                    _plcName, pduBodyLen, MbapFrame.MaxPduBodySize);
                return;
            }

            // Allocate a fresh frame buffer per PDU; the multiplexer retains it.
            byte[] frame = new byte[MbapFrame.HeaderSize + pduBodyLen];
            Buffer.BlockCopy(headerBuf, 0, frame, 0, MbapFrame.HeaderSize);
            if (!await FillAsync(_upstream, frame, MbapFrame.HeaderSize, pduBodyLen, token)
                .ConfigureAwait(false))
                return;

            // Count the frame as forwarded at hand-off, in both branches alike.
            Interlocked.Increment(ref _pdusForwardedCount);
            await onFrame(frame, token).ConfigureAwait(false);
        }
    }
    catch (OperationCanceledException)
    {
        // Normal shutdown.
    }
    catch (SocketException)
    {
        // Upstream socket closed by remote end — normal.
    }
    catch (ObjectDisposedException)
    {
        // Socket or CTS disposed by DisposeAsync — normal.
    }
}
/// <summary>
/// Runs the write side of the pipe. Drains <see cref="_responseChannel"/> and writes
/// each frame to the upstream socket. Returns when the channel completes, the pipe is
/// disposed, <paramref name="ct"/> is cancelled, or the upstream socket fails.
///
/// <para>On exit the response channel is completed: once this loop is gone nothing
/// drains the channel, so leaving it open would let producers block on a full queue.
/// After completion <see cref="SendResponseAsync"/> drops frames silently.</para>
/// </summary>
/// <param name="ct">External cancellation (combined with the pipe's own CTS).</param>
public async Task RunWriteLoopAsync(CancellationToken ct)
{
    try
    {
        // Created inside the try on purpose: _cts.Token throws ObjectDisposedException
        // once DisposeAsync has disposed the CTS, which can happen before this
        // background loop gets to run.
        using var linked = CancellationTokenSource.CreateLinkedTokenSource(ct, _cts.Token);
        var token = linked.Token;

        await foreach (var frame in _responseChannel.Reader.ReadAllAsync(token).ConfigureAwait(false))
        {
            await SendAllAsync(_upstream, frame.AsMemory(), token).ConfigureAwait(false);
        }
    }
    catch (OperationCanceledException)
    {
        // Normal shutdown.
    }
    catch (SocketException)
    {
        // Upstream remote closed — normal.
    }
    catch (ObjectDisposedException)
    {
        // Socket or CTS disposed elsewhere — normal.
    }
    finally
    {
        // No consumer remains; fail pending and future writes fast instead of letting
        // them wait for capacity that will never free up. Safe to call repeatedly.
        _responseChannel.Writer.TryComplete();
    }
}
/// <summary>
/// Queues <paramref name="frame"/> on the response channel, from which the write loop
/// delivers it to the upstream socket. A frame offered to a pipe that is no longer alive
/// is dropped immediately. A frame whose enqueue is interrupted — because the channel was
/// completed or <paramref name="ct"/> was cancelled — is dropped silently as well.
/// </summary>
/// <param name="frame">Complete response frame to deliver.</param>
/// <param name="ct">Cancels a pending enqueue.</param>
public async ValueTask SendResponseAsync(byte[] frame, CancellationToken ct)
{
    if (IsAlive)
    {
        try
        {
            await _responseChannel.Writer.WriteAsync(frame, ct).ConfigureAwait(false);
        }
        catch (Exception dropped) when (dropped is ChannelClosedException or OperationCanceledException)
        {
            // Pipe disposed mid-write, or caller cancelled — the frame is intentionally dropped.
        }
    }
}
/// <summary>
/// Closes the pipe. In order: completes the response channel, cancels the pipe's token
/// source (the write loop links its token to it), shuts down and disposes the upstream
/// socket, disposes the token source, and logs the client disconnect. Calls made after
/// <c>_disposed</c> has been set return immediately.
/// </summary>
public async ValueTask DisposeAsync()
{
// NOTE(review): this check-then-set is not atomic. If two callers can dispose the same
// pipe concurrently, both may pass the check and run the teardown below twice —
// confirm whether that can happen and whether an Interlocked guard is needed.
if (_disposed) return;
_disposed = true;
// Completing the writer lets the write loop's ReadAllAsync enumeration finish.
try { _responseChannel.Writer.TryComplete(); } catch { /* already complete */ }
await _cts.CancelAsync().ConfigureAwait(false);
// Shutdown can throw if the socket is already closed; the failure is deliberately ignored.
try { _upstream.Shutdown(SocketShutdown.Both); } catch { /* already closed */ }
_upstream.Dispose();
_cts.Dispose();
// RemoteEp is read after the socket has been disposed; "?" is logged when it is null.
string remoteStr = RemoteEp?.ToString() ?? "?";
MultiplexerLogEvents.ClientDisconnected(_logger, _plcName, remoteStr, "Pipe disposed");
}
// ── Low-level I/O helpers ─────────────────────────────────────────────────────
/// <summary>
/// Reads exactly <paramref name="count"/> bytes from <paramref name="socket"/> into
/// <paramref name="buf"/>, starting at <paramref name="offset"/>, issuing as many receives
/// as needed.
/// </summary>
/// <param name="socket">Connected socket to read from.</param>
/// <param name="buf">Destination buffer; must hold at least <c>offset + count</c> bytes.</param>
/// <param name="offset">Index in <paramref name="buf"/> of the first byte to write.</param>
/// <param name="count">Number of bytes to read; zero returns <c>true</c> without reading.</param>
/// <param name="ct">Cancels a pending receive.</param>
/// <returns>
/// <c>true</c> when all bytes were received; <c>false</c> when the remote end closed the
/// connection (a zero-byte receive) before the range was filled, whether that happened
/// before the first byte or part-way through.
/// </returns>
private static async Task<bool> FillAsync(
    Socket socket, byte[] buf, int offset, int count, CancellationToken ct)
{
    int filled = 0;
    while (filled < count)
    {
        int received = await socket.ReceiveAsync(
            buf.AsMemory(offset + filled, count - filled),
            SocketFlags.None,
            ct).ConfigureAwait(false);

        // Zero bytes means the peer closed the connection; the caller cannot use a
        // partial frame, so clean and mid-frame closes are reported identically.
        if (received == 0)
            return false;

        filled += received;
    }
    return true;
}
/// <summary>
/// Writes the whole of <paramref name="memory"/> to <paramref name="socket"/>, repeating
/// the send until every byte has been accepted.
/// </summary>
/// <exception cref="SocketException">
/// Thrown with <see cref="SocketError.ConnectionReset"/> when a send reports zero bytes written.
/// </exception>
private static async Task SendAllAsync(Socket socket, Memory<byte> memory, CancellationToken ct)
{
    int offset = 0;
    while (offset < memory.Length)
    {
        int sent = await socket.SendAsync(memory[offset..], SocketFlags.None, ct).ConfigureAwait(false);
        if (sent == 0)
            throw new SocketException((int)SocketError.ConnectionReset);
        offset += sent;
    }
}
}
@@ -0,0 +1,19 @@
namespace Mbproxy.Proxy;
/// <summary>
/// No-op PDU pipeline: passes every frame through byte-for-byte without rewriting.
/// Introduced in Phase 03 as the <see cref="IPduPipeline"/> singleton registration;
/// Phase 04 replaces that registration with BcdPduPipeline.
/// </summary>
internal sealed class NoopPduPipeline : IPduPipeline
{
/// <summary>
/// Does nothing: <paramref name="pdu"/> is left untouched and no parameter is read.
/// </summary>
public void Process(
MbapDirection direction,
ReadOnlySpan<byte> mbapHeader,
Span<byte> pdu,
PduContext context)
{
// Intentional no-op: bytes forwarded unmodified.
// The Phase 04 replacement for this registration is BcdPduPipeline.
}
}
@@ -0,0 +1,60 @@
using Mbproxy.Bcd;
using Mbproxy.Proxy.Multiplexing;
namespace Mbproxy.Proxy;
/// <summary>
/// Per-PLC <see cref="PduContext"/> carrying the resolved BCD tag map, the live counters
/// and a logger. Because it derives from <see cref="PduContext"/> it can be handed to
/// anything that expects the base type.
///
/// <para><see cref="ProxyWorker"/> builds one instance per configured PLC at startup and
/// the listener passes it to that PLC's <see cref="Multiplexing.PlcMultiplexer"/>. The
/// members it exposes are init-only references; mutable counting goes through
/// <see cref="ProxyCounters"/>, which is Interlocked-based.</para>
///
/// <para><b>Phase 9 — request correlation:</b> <see cref="CurrentRequest"/> is init-only,
/// so it cannot be changed on the shared instance. A pipeline call that needs a
/// correlation entry receives a fresh copy from <see cref="WithCurrentRequest"/>, which
/// keeps concurrent multiplexed calls from observing each other's request.</para>
///
/// <para>NOTE(review): earlier comments in this file disagreed on the request path — one
/// said the multiplexer sets <see cref="CurrentRequest"/> for both directions, another
/// said it is <c>null</c> on request calls. Confirm against the multiplexer and keep
/// whichever statement is true.</para>
/// </summary>
internal class PerPlcContext : PduContext
{
    /// <summary>Resolved BCD tag map for this PLC; defaults to the empty map.</summary>
    public BcdTagMap TagMap { get; init; } = BcdTagMap.Empty;

    /// <summary>Counter set for this PLC; defaults to a fresh, zeroed instance.</summary>
    public ProxyCounters Counters { get; init; } = new();

    /// <summary>Logger used by the rewriter; defaults to the null logger.</summary>
    public ILogger Logger { get; init; } = Microsoft.Extensions.Logging.Abstractions.NullLogger.Instance;

    /// <summary>
    /// Correlation entry for the PDU call this context instance was created for, or
    /// <c>null</c> when none was supplied. On a response it gives the BCD rewriter the
    /// originating FC03/FC04 start address and quantity, which the response PDU itself
    /// does not carry.
    /// </summary>
    internal InFlightRequest? CurrentRequest { get; init; }

    /// <summary>
    /// Creates a shallow copy of this context whose <see cref="CurrentRequest"/> is
    /// <paramref name="req"/>. The copy shares <see cref="TagMap"/>, <see cref="Counters"/>
    /// and <see cref="Logger"/> with the original and costs a single allocation.
    /// </summary>
    internal PerPlcContext WithCurrentRequest(InFlightRequest? req)
    {
        var perCall = new PerPlcContext
        {
            PlcName = PlcName,
            TagMap = TagMap,
            Counters = Counters,
            Logger = Logger,
            CurrentRequest = req,
        };
        return perCall;
    }
}
+188
View File
@@ -0,0 +1,188 @@
using System.Collections.Concurrent;
using System.Net;
using System.Net.Sockets;
using Mbproxy.Options;
using Mbproxy.Proxy.Multiplexing;
using Polly;
namespace Mbproxy.Proxy;
/// <summary>
/// Owns one <see cref="TcpListener"/> bound to a PLC's configured listen port and one
/// <see cref="PlcMultiplexer"/> that owns the single backend connection to the PLC.
///
/// <para><b>Phase 9 — TxId multiplexing:</b> the listener no longer pairs each upstream
/// socket with a dedicated backend socket. Instead, every accepted upstream is wrapped
/// in an <see cref="UpstreamPipe"/> and handed to the multiplexer. The multiplexer holds
/// at most one TCP connection to the PLC, eliminating the H2-ECOM100's 4-concurrent-client
/// cap from the upstream side.</para>
///
/// <para>The listener's accept loop is otherwise unchanged. <see cref="StartAsync"/>
/// binds the socket; <see cref="RunAsync"/> runs until cancelled or the listener faults;
/// <see cref="DisposeAsync"/> tears down both the listener and the multiplexer.</para>
/// </summary>
internal sealed partial class PlcListener : IAsyncDisposable
{
private readonly PlcOptions _plc;
private readonly ConnectionOptions _connectionOptions;
private readonly IPduPipeline _pipeline;
private readonly ILogger<PlcListener> _listenerLogger;
private readonly ILogger<PlcMultiplexer> _multiplexerLogger;
private readonly ILogger _pipeLogger;
private readonly PerPlcContext? _perPlcContext;
private readonly ResiliencePipeline? _backendConnectPipeline;
private TcpListener? _listener;
private PlcMultiplexer? _multiplexer;
private bool _disposed;
// Track active pipe-handling tasks so DisposeAsync can wait for them.
private readonly ConcurrentDictionary<Guid, Task> _pipeTasks = new();
/// <summary>
/// Live collection of the <see cref="UpstreamPipe"/> instances attached to this
/// listener's multiplexer, intended for per-client telemetry on the status page.
/// Empty whenever no multiplexer exists — that is, before <see cref="StartAsync"/> has
/// run or after <see cref="DisposeAsync"/> has released it.
/// </summary>
public IReadOnlyCollection<UpstreamPipe> ActiveUpstreams
{
    get
    {
        PlcMultiplexer? mux = _multiplexer;
        return mux?.AttachedPipes ?? Array.Empty<UpstreamPipe>();
    }
}
/// <summary>
/// Stores the collaborators; no socket is opened and no multiplexer is created until
/// <see cref="StartAsync"/> is called.
/// </summary>
/// <param name="plc">Options for the PLC this listener fronts (name, listen port).</param>
/// <param name="connectionOptions">Connection settings forwarded to the multiplexer.</param>
/// <param name="pipeline">PDU pipeline forwarded to the multiplexer.</param>
/// <param name="listenerLogger">Logger for bind and fault events of this listener.</param>
/// <param name="multiplexerLogger">Logger handed to the multiplexer.</param>
/// <param name="pipeLogger">Logger handed to each <see cref="UpstreamPipe"/>.</param>
/// <param name="perPlcContext">Optional shared context; a minimal one is built at start when omitted.</param>
/// <param name="backendConnectPipeline">Optional resilience pipeline forwarded to the multiplexer.</param>
public PlcListener(
    PlcOptions plc,
    ConnectionOptions connectionOptions,
    IPduPipeline pipeline,
    ILogger<PlcListener> listenerLogger,
    ILogger<PlcMultiplexer> multiplexerLogger,
    ILogger pipeLogger,
    PerPlcContext? perPlcContext = null,
    ResiliencePipeline? backendConnectPipeline = null)
{
    (_plc, _connectionOptions, _pipeline) = (plc, connectionOptions, pipeline);
    (_listenerLogger, _multiplexerLogger, _pipeLogger) = (listenerLogger, multiplexerLogger, pipeLogger);
    (_perPlcContext, _backendConnectPipeline) = (perPlcContext, backendConnectPipeline);
}
/// <summary>
/// Binds the listen socket on all interfaces at the PLC's configured port, logs the
/// bind, and constructs the <see cref="PlcMultiplexer"/>. Despite the <c>Async</c>
/// suffix this method is synchronous. A bind failure surfaces as a
/// <see cref="SocketException"/> from <see cref="TcpListener.Start()"/> and is not
/// caught here; the caller is responsible for handling it.
/// </summary>
public void StartAsync()
{
    _listener = new TcpListener(new IPEndPoint(IPAddress.Any, _plc.ListenPort));
    _listener.Start();
    LogBound(_listenerLogger, _plc.Name, _plc.ListenPort);

    // The multiplexer always needs a PerPlcContext. When the caller supplied none, fall
    // back to a minimal one that carries only the PLC name and the pipe logger, leaving
    // the tag map and counters at their defaults.
    PerPlcContext context = _perPlcContext is { } supplied
        ? supplied
        : new PerPlcContext
        {
            PlcName = _plc.Name,
            Logger = _pipeLogger,
        };

    _multiplexer = new PlcMultiplexer(
        _plc,
        _connectionOptions,
        _pipeline,
        context,
        _multiplexerLogger,
        _backendConnectPipeline);
}
/// <summary>
/// Runs the accept loop until <paramref name="ct"/> is cancelled or the listener
/// faults. Each accepted socket is wrapped in an <see cref="UpstreamPipe"/> and handed
/// to the multiplexer on its own task; the pipe is disposed when that task ends.
/// </summary>
/// <param name="ct">Stops the accept loop and is forwarded to each pipe.</param>
/// <exception cref="InvalidOperationException">
/// <see cref="StartAsync"/> has not been called (no listener or no multiplexer).
/// </exception>
public async Task RunAsync(CancellationToken ct)
{
    // Capture both fields once. DisposeAsync sets _multiplexer to null, and a pipe task
    // scheduled below may only start running after that; reading the field from inside
    // the lambda could then throw NullReferenceException instead of letting the
    // multiplexer reject the pipe.
    TcpListener listener = _listener
        ?? throw new InvalidOperationException("StartAsync must be called before RunAsync.");
    PlcMultiplexer multiplexer = _multiplexer
        ?? throw new InvalidOperationException("StartAsync must construct the multiplexer before RunAsync.");

    try
    {
        while (!ct.IsCancellationRequested)
        {
            Socket upstream = await listener.AcceptSocketAsync(ct).ConfigureAwait(false);
            var pipe = new UpstreamPipe(upstream, _plc.Name, _pipeLogger);

            // CancellationToken.None: the delegate must always start so that its finally
            // block disposes the pipe even when ct is already cancelled.
            var pipeTask = Task.Run(async () =>
            {
                try
                {
                    await multiplexer.StartPipeAsync(pipe, ct).ConfigureAwait(false);
                }
                finally
                {
                    await pipe.DisposeAsync().ConfigureAwait(false);
                }
            }, CancellationToken.None);

            // Track the task so DisposeAsync can wait for it; untrack on completion.
            _pipeTasks[pipe.Id] = pipeTask;
            _ = pipeTask.ContinueWith(prev => _pipeTasks.TryRemove(pipe.Id, out _), TaskScheduler.Default);
        }
    }
    catch (OperationCanceledException)
    {
        // Normal shutdown.
    }
    catch (Exception ex)
    {
        // Listener faulted — log and return. The supervisor will restart.
        LogListenerFaulted(_listenerLogger, _plc.Name, _plc.ListenPort, ex.Message);
    }
}
// ── IAsyncDisposable ──────────────────────────────────────────────────────────────────
/// <summary>
/// Tears the listener down: stops accepting, disposes the multiplexer, then waits up to
/// five seconds for the tracked pipe tasks to finish. Failures and timeouts while
/// waiting are ignored (best effort). Calls after the first return immediately.
/// </summary>
public async ValueTask DisposeAsync()
{
    if (_disposed)
        return;
    _disposed = true;

    _listener?.Stop();

    if (_multiplexer is { } multiplexer)
    {
        await multiplexer.DisposeAsync().ConfigureAwait(false);
        _multiplexer = null;
    }

    Task[] pending = _pipeTasks.Values.ToArray();
    if (pending.Length == 0)
        return;

    using var deadline = new CancellationTokenSource(TimeSpan.FromSeconds(5));
    try
    {
        await Task.WhenAll(pending)
            .WaitAsync(deadline.Token)
            .ConfigureAwait(false);
    }
    catch
    {
        // Best effort.
    }
}
// ── Logging ───────────────────────────────────────────────────────────────────────────
[LoggerMessage(EventId = 20, EventName = "mbproxy.startup.bind",
Level = LogLevel.Information, Message = "Listener bound: Plc={Plc} Port={Port}")]
private static partial void LogBound(ILogger logger, string plc, int port);
[LoggerMessage(EventId = 22, EventName = "mbproxy.listener.faulted",
Level = LogLevel.Error, Message = "Listener faulted: Plc={Plc} Port={Port} Reason={Reason}")]
private static partial void LogListenerFaulted(ILogger logger, string plc, int port, string reason);
}
+336
View File
@@ -0,0 +1,336 @@
namespace Mbproxy.Proxy;
/// <summary>
/// Immutable snapshot of per-PLC counters. Consumed by Phase 07's status page.
/// All fields are point-in-time reads; no ordering guarantees across fields.
///
/// <para><b>Backwards-compat policy (see docs/kpi.md):</b> fields are <i>added</i>, never
/// renamed or removed. Phase 9 appended <c>InFlightCount</c>, <c>MaxInFlight</c>,
/// <c>TxIdWraps</c>, <c>BackendDisconnectCascades</c>, and <c>BackendQueueDepth</c> for
/// the TxId-multiplexer telemetry surface (Tier 1.6 in docs/kpi.md).</para>
/// </summary>
public sealed record CounterSnapshot(
long PdusForwarded,
long Fc03,
long Fc04,
long Fc06,
long Fc16,
long FcOther,
long RewrittenSlots,
long PartialBcdWarnings,
long InvalidBcdWarnings,
long BackendException01,
long BackendException02,
long BackendException03,
long BackendException04,
long BackendExceptionOther,
long BytesUpstreamIn,
long BytesUpstreamOut,
/// <summary>
/// Total number of failed listener bind attempts over the lifetime of the supervisor.
/// Accumulates; never resets. See <see cref="SupervisorSnapshot.RecoveryAttempts"/> doc.
/// </summary>
long RecoveryAttempts,
/// <summary>
/// Most recent bind failure message (up to 256 chars); <c>null</c> if the listener
/// has never failed to bind, or if the error was cleared after a later successful bind.
/// </summary>
string? LastBindError,
/// <summary>
/// EWMA of recent backend round-trip times in milliseconds (α = 0.2).
/// Zero when no successful round-trips have been observed yet.
/// Stored internally as a whole number of microseconds in a <c>long</c> (for Interlocked
/// compatibility) and divided by 1000 to produce double ms on snapshot.
/// </summary>
double LastRoundTripMs,
/// <summary>
/// Number of backend connections successfully established (Polly final success).
/// </summary>
long ConnectsSuccess,
/// <summary>
/// Number of backend connections that failed on all Polly attempts.
/// </summary>
long ConnectsFailed,
/// <summary>
/// Number of Modbus requests currently in flight on this PLC's multiplexed backend
/// connection (point-in-time snapshot of the correlation map size). Phase 9.
/// </summary>
long InFlightCount,
/// <summary>
/// Peak <see cref="InFlightCount"/> observed since the multiplexer was constructed.
/// Updated via <see cref="Interlocked"/> CAS so concurrent in-flight increments do not
/// lose the high-water mark. Phase 9.
/// </summary>
long MaxInFlight,
/// <summary>
/// Number of times the per-PLC TxId allocator's rolling cursor has wrapped
/// 0xFFFF → 0x0000. A non-zero value is benign; a sudden burst suggests extreme
/// in-flight churn. Phase 9.
/// </summary>
long TxIdWraps,
/// <summary>
/// Cumulative count of upstream pipes closed as a side effect of a backend disconnect.
/// Each backend reconnect cycle adds the number of attached upstream clients at the
/// time of the disconnect. Phase 9.
/// </summary>
long BackendDisconnectCascades,
/// <summary>
/// Current depth of the per-PLC outbound channel feeding the backend writer task
/// (frames queued, not yet on the wire). A sustained non-zero value indicates the
/// backend is slower than upstream demand. Phase 9.
/// </summary>
long BackendQueueDepth);
/// <summary>
/// Thread-safe per-PLC counters backed by <see cref="System.Threading.Interlocked"/> longs.
/// All increment methods are allocation-free (no boxing, no heap traffic on the hot path).
/// <see cref="Snapshot"/> may allocate (record construction) — it is off-path (status page only).
/// </summary>
internal sealed class ProxyCounters
{
// ── Hot-path fields (Interlocked longs) ─────────────────────────────────
private long _pdusForwarded;
private long _fc03;
private long _fc04;
private long _fc06;
private long _fc16;
private long _fcOther;
private long _rewrittenSlots;
private long _partialBcdWarnings;
private long _invalidBcdWarnings;
private long _backendException01;
private long _backendException02;
private long _backendException03;
private long _backendException04;
private long _backendExceptionOther;
private long _bytesUpstreamIn;
private long _bytesUpstreamOut;
private long _recoveryAttempts;
private long _connectsSuccess;
private long _connectsFailed;
// Phase 9 multiplexer telemetry.
private long _maxInFlight;
private long _backendDisconnectCascades;
// Phase 9: live state pulled from the multiplexer's allocator/map/queue on each
// snapshot. The multiplexer registers a single provider via SetMultiplexProvider.
// We use a volatile reference for lock-free read on the snapshot path.
private volatile IMultiplexCountersProvider? _multiplexProvider;
// LastBindError is a string (not a long), so it is held in a volatile reference field
// rather than an Interlocked long. IncrementRecoveryAttempt writes it,
// ClearLastBindError resets it to null, and Snapshot reads it.
private volatile string? _lastBindError;
// EWMA round-trip: stored as a whole number of microseconds (milliseconds * 1000) so we
// can use Interlocked.CompareExchange on a long. The EWMA smoothing factor α = 0.2 gives a
// half-life of ~3 samples (responds quickly to changes without being noisy).
// Updated by PlcMultiplexer on each successful response (request→response round-trip,
// measured against InFlightRequest.SentAtUtc).
// 0 = no samples observed yet.
private long _lastRoundTripUsEwma; // microseconds
// ── Increment methods ────────────────────────────────────────────────────
public void IncrementPdusForwarded()
=> Interlocked.Increment(ref _pdusForwarded);
/// <summary>
/// Increments the per-function-code counter for <paramref name="fc"/>: 0x03, 0x04, 0x06
/// and 0x10 have dedicated counters; every other code is counted as "other".
/// </summary>
public void IncrementFcCount(byte fc)
{
switch (fc)
{
case 0x03: Interlocked.Increment(ref _fc03); break;
case 0x04: Interlocked.Increment(ref _fc04); break;
case 0x06: Interlocked.Increment(ref _fc06); break;
case 0x10: Interlocked.Increment(ref _fc16); break;
default: Interlocked.Increment(ref _fcOther); break;
}
}
public void AddRewrittenSlots(int n)
=> Interlocked.Add(ref _rewrittenSlots, n);
public void IncrementPartialBcd()
=> Interlocked.Increment(ref _partialBcdWarnings);
public void IncrementInvalidBcd()
=> Interlocked.Increment(ref _invalidBcdWarnings);
/// <summary>
/// Increments the backend-exception counter for the given Modbus exception code.
/// Codes 1-4 map to individual counters; anything else goes to "Other".
/// </summary>
public void IncrementBackendException(byte code)
{
switch (code)
{
case 1: Interlocked.Increment(ref _backendException01); break;
case 2: Interlocked.Increment(ref _backendException02); break;
case 3: Interlocked.Increment(ref _backendException03); break;
case 4: Interlocked.Increment(ref _backendException04); break;
default: Interlocked.Increment(ref _backendExceptionOther); break;
}
}
/// <summary>
/// Adds <paramref name="up"/> to the upstream-in byte count and <paramref name="down"/>
/// to the upstream-out byte count. Each addition is atomic on its own; the two are not
/// applied as a single atomic unit, so a concurrent snapshot may observe only one of them.
/// </summary>
public void AddBytes(long up, long down)
{
Interlocked.Add(ref _bytesUpstreamIn, up);
Interlocked.Add(ref _bytesUpstreamOut, down);
}
/// <summary>
/// Records one successful backend TCP connect (Polly pipeline returned success).
/// </summary>
public void IncrementConnectSuccess()
=> Interlocked.Increment(ref _connectsSuccess);
/// <summary>
/// Records one failed backend TCP connect (all Polly attempts exhausted).
/// </summary>
public void IncrementConnectFailed()
=> Interlocked.Increment(ref _connectsFailed);
/// <summary>
/// Records <paramref name="n"/> upstream pipes closed by a backend disconnect cascade.
/// Phase 9.
/// </summary>
public void AddDisconnectCascades(int n)
=> Interlocked.Add(ref _backendDisconnectCascades, n);
/// <summary>
/// CAS-updates the peak in-flight high-water mark. Called on every successful
/// allocation by the multiplexer. Phase 9.
/// </summary>
public void ObserveInFlight(int currentInFlight)
{
long sample = currentInFlight;
long old;
do
{
old = Interlocked.Read(ref _maxInFlight);
// Not a new peak — nothing to publish.
if (sample <= old) return;
}
// Retry when another thread changed the peak between the read and the exchange.
while (Interlocked.CompareExchange(ref _maxInFlight, sample, old) != old);
}
/// <summary>
/// Wires the live multiplexer telemetry source into this counter set. Called by
/// <see cref="Mbproxy.Proxy.Multiplexing.PlcMultiplexer"/> at construction time so
/// the status page's <see cref="Snapshot"/> can include live in-flight / queue-depth
/// values without polling the multiplexer separately. Phase 9.
/// </summary>
internal void SetMultiplexProvider(IMultiplexCountersProvider? provider)
=> _multiplexProvider = provider;
/// <summary>
/// Increments the recovery-attempt counter and records the bind error message
/// (truncated to 256 chars). Called by the supervisor on each failed bind.
/// </summary>
public void IncrementRecoveryAttempt(string errorMessage)
{
Interlocked.Increment(ref _recoveryAttempts);
_lastBindError = errorMessage.Length > 256 ? errorMessage[..256] : errorMessage;
}
/// <summary>
/// Clears the last bind error after a successful bind.
/// </summary>
public void ClearLastBindError()
{
_lastBindError = null;
}
/// <summary>
/// Updates the EWMA round-trip estimate with a new sample.
/// Uses α = 0.2: new_ewma = 0.2 * sample + 0.8 * old_ewma.
/// <paramref name="elapsedTicks"/> is an elapsed interval measured in
/// <see cref="System.Diagnostics.Stopwatch"/> ticks (it is divided by
/// <see cref="System.Diagnostics.Stopwatch.Frequency"/>).
/// Thread-safe via CAS loop on a long holding whole microseconds.
/// </summary>
public void UpdateRoundTripEwma(long elapsedTicks)
{
// Convert ticks to milliseconds.
double sampleMs = (double)elapsedTicks / System.Diagnostics.Stopwatch.Frequency * 1000.0;
// Store milliseconds * 1000 (i.e. whole microseconds) as a long for CAS.
// This gives ~1 µs resolution which is fine for Modbus round-trips (1-100 ms range).
long sampleFixed = (long)(sampleMs * 1000.0);
long old, newVal;
do
{
old = Interlocked.Read(ref _lastRoundTripUsEwma);
// If no previous sample, seed with first sample; otherwise apply EWMA.
// Zero doubles as the "no sample yet" marker, so a stored value of exactly 0 is re-seeded.
newVal = old == 0
? sampleFixed
: (long)(0.2 * sampleFixed + 0.8 * old);
}
while (Interlocked.CompareExchange(ref _lastRoundTripUsEwma, newVal, old) != old);
}
// ── Snapshot (off hot-path, may allocate) ────────────────────────────────
/// <summary>
/// Returns a point-in-time snapshot of all counters.
/// Each long counter is read atomically via <see cref="Interlocked.Read"/>; the live
/// multiplexer values come from the registered provider (zero when none is registered).
/// Fields are read one after another, so the snapshot is not a single atomic cut.
/// May allocate (record construction); intended for the status-page path only.
/// </summary>
public CounterSnapshot Snapshot()
{
// Read the volatile provider reference once so all three live values use the same source.
var provider = _multiplexProvider;
long inFlightNow = provider?.InFlightCount ?? 0;
long txWraps = provider?.TxIdWraps ?? 0;
long queueDepth = provider?.BackendQueueDepth ?? 0;
return new(
PdusForwarded: Interlocked.Read(ref _pdusForwarded),
Fc03: Interlocked.Read(ref _fc03),
Fc04: Interlocked.Read(ref _fc04),
Fc06: Interlocked.Read(ref _fc06),
Fc16: Interlocked.Read(ref _fc16),
FcOther: Interlocked.Read(ref _fcOther),
RewrittenSlots: Interlocked.Read(ref _rewrittenSlots),
PartialBcdWarnings: Interlocked.Read(ref _partialBcdWarnings),
InvalidBcdWarnings: Interlocked.Read(ref _invalidBcdWarnings),
BackendException01: Interlocked.Read(ref _backendException01),
BackendException02: Interlocked.Read(ref _backendException02),
BackendException03: Interlocked.Read(ref _backendException03),
BackendException04: Interlocked.Read(ref _backendException04),
BackendExceptionOther: Interlocked.Read(ref _backendExceptionOther),
BytesUpstreamIn: Interlocked.Read(ref _bytesUpstreamIn),
BytesUpstreamOut: Interlocked.Read(ref _bytesUpstreamOut),
RecoveryAttempts: Interlocked.Read(ref _recoveryAttempts),
LastBindError: _lastBindError,
// Stored microseconds → milliseconds.
LastRoundTripMs: Interlocked.Read(ref _lastRoundTripUsEwma) / 1000.0,
ConnectsSuccess: Interlocked.Read(ref _connectsSuccess),
ConnectsFailed: Interlocked.Read(ref _connectsFailed),
InFlightCount: inFlightNow,
MaxInFlight: Interlocked.Read(ref _maxInFlight),
TxIdWraps: txWraps,
BackendDisconnectCascades: Interlocked.Read(ref _backendDisconnectCascades),
BackendQueueDepth: queueDepth);
}
}
/// <summary>
/// Read-only window into the per-PLC multiplexer's live state (allocator counts,
/// outbound-queue depth). Implemented by <see cref="Mbproxy.Proxy.Multiplexing.PlcMultiplexer"/>
/// and registered with <see cref="ProxyCounters.SetMultiplexProvider"/> so
/// <see cref="ProxyCounters.Snapshot"/> can include live mux telemetry while depending
/// only on this interface rather than on the concrete multiplexer type (which would
/// couple counter snapshots to the connection layer). Phase 9.
/// </summary>
internal interface IMultiplexCountersProvider
{
/// <summary>Number of currently-in-flight requests on the backend socket.</summary>
long InFlightCount { get; }
/// <summary>Cumulative 0xFFFF → 0x0000 wrap events from the TxId allocator.</summary>
long TxIdWraps { get; }
/// <summary>Current depth of the outbound channel (frames queued for the backend writer).</summary>
long BackendQueueDepth { get; }
}
+218
View File
@@ -0,0 +1,218 @@
using Mbproxy.Bcd;
using Mbproxy.Configuration;
using Mbproxy.Options;
using Mbproxy.Proxy.Multiplexing;
using Mbproxy.Proxy.Supervision;
using Microsoft.Extensions.Options;
using Polly;
namespace Mbproxy.Proxy;
/// <summary>
/// <see cref="BackgroundService"/> that owns all <see cref="PlcListenerSupervisor"/> instances.
///
/// Startup posture (matches design doc "eager, continue on per-port failure"):
/// <list type="number">
/// <item>Enumerate <see cref="MbproxyOptions.Plcs"/> and build one supervisor per PLC.</item>
/// <item>Start all supervisors in parallel. Each supervisor attempts to bind immediately
/// and enters the Polly recovery loop if the bind fails.</item>
/// <item>After all supervisors have completed their first bind attempt (reached
/// <see cref="SupervisorState.Bound"/> or <see cref="SupervisorState.Recovering"/>),
/// log <c>mbproxy.startup.ready</c> with bound/configured counts.</item>
/// </list>
///
/// Phase 06: passes the supervisor dictionary to <see cref="ConfigReconciler.Attach"/>
/// after initial startup so hot-reload changes are applied by the reconciler.
///
/// Stop: cancels all supervisors in parallel with a 5-second hard deadline.
/// </summary>
internal sealed partial class ProxyWorker : BackgroundService
{
private readonly IOptionsMonitor<MbproxyOptions> _options;
private readonly IPduPipeline _pipeline;
private readonly ILogger<ProxyWorker> _logger;
private readonly ILoggerFactory _loggerFactory;
private readonly ConfigReconciler _reconciler;
// Phase 06: supervisors are now managed jointly by ProxyWorker (initial bootstrap)
// and ConfigReconciler (subsequent hot-reload changes). The dictionary is shared
// via ConfigReconciler.Attach() after initial startup.
private readonly Dictionary<string, PlcListenerSupervisor> _supervisors = new(StringComparer.Ordinal);
/// <summary>
/// Read-only view of the live supervisor dictionary. Consumed by Phase 07's
/// <see cref="Admin.StatusSnapshotBuilder"/> to enumerate per-PLC state.
/// The caller should read this on the status-page path only (not the hot path).
/// </summary>
internal IReadOnlyDictionary<string, PlcListenerSupervisor> Supervisors => _supervisors;
/// <summary>
/// Stores the injected services. No supervisors are built here; that happens in
/// <see cref="ExecuteAsync"/>.
/// </summary>
/// <param name="options">Monitor supplying the current <see cref="MbproxyOptions"/>.</param>
/// <param name="pipeline">PDU pipeline shared by every supervisor.</param>
/// <param name="logger">Logger for worker-level events.</param>
/// <param name="loggerFactory">Factory for the per-PLC and per-component loggers.</param>
/// <param name="reconciler">Hot-reload reconciler that is attached to the supervisor dictionary.</param>
public ProxyWorker(
    IOptionsMonitor<MbproxyOptions> options,
    IPduPipeline pipeline,
    ILogger<ProxyWorker> logger,
    ILoggerFactory loggerFactory,
    ConfigReconciler reconciler)
{
    (_options, _pipeline, _reconciler) = (options, pipeline, reconciler);
    (_logger, _loggerFactory) = (logger, loggerFactory);
}
/// <summary>
/// Bootstraps the proxy. Builds a <see cref="PerPlcContext"/> (BCD tag map, counters,
/// logger) for every configured PLC whose tag map builds without errors, creates one
/// <see cref="PlcListenerSupervisor"/> per such PLC, attaches the supervisor dictionary
/// to the <see cref="ConfigReconciler"/>, starts all supervisors in parallel, waits up
/// to 30 s for their first bind attempts, logs <c>mbproxy.startup.ready</c>, and then
/// idles until <paramref name="stoppingToken"/> is cancelled.
/// </summary>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
var opts = _options.CurrentValue;
int plcsConfigured = opts.Plcs.Count;
// ── 1. Build per-PLC BCD tag maps ────────────────────────────────────────────
var plcContexts = new Dictionary<string, PerPlcContext>(opts.Plcs.Count, StringComparer.Ordinal);
foreach (var plc in opts.Plcs)
{
var result = BcdTagMapBuilder.Build(opts.BcdTags, plc.BcdTags);
foreach (var warn in result.Warnings)
_logger.LogWarning("[{Plc}] BCD tag map warning: {Message}", plc.Name, warn.Message);
// Any error disqualifies the PLC: it gets no context and therefore no supervisor.
if (result.Errors.Count > 0)
{
foreach (var err in result.Errors)
_logger.LogError("[{Plc}] BCD tag map error ({Kind}): {Message}",
plc.Name, err.Kind, err.Message);
_logger.LogError("Skipping listener for PLC '{Plc}' due to BCD tag map errors.", plc.Name);
continue;
}
plcContexts[plc.Name] = new PerPlcContext
{
PlcName = plc.Name,
TagMap = result.Map,
Counters = new ProxyCounters(),
Logger = _loggerFactory.CreateLogger($"Mbproxy.Proxy.BcdRewriter.{plc.Name}"),
};
}
// ── 2. Build the shared backend-connect Polly pipeline once ──────────────────
// The backend-connect pipeline is built from ResilienceOptions here and reused across
// all PLCs. The listener-recovery pipeline is NOT shared: step 3 builds one per supervisor.
var resilienceOpts = opts.Resilience;
var backendPipeline = PolicyFactory.BuildBackendConnect(
resilienceOpts.BackendConnect,
_loggerFactory.CreateLogger("Mbproxy.Proxy.BackendConnect"));
// ── 3. Build supervisors ──────────────────────────────────────────────────────
foreach (var plc in opts.Plcs)
{
if (!plcContexts.TryGetValue(plc.Name, out var perPlcContext))
continue; // BCD map failed — skip this PLC.
// Each supervisor gets its own recovery pipeline (with its own logger scope).
var recoveryPipeline = PolicyFactory.BuildListenerRecovery(
resilienceOpts.ListenerRecovery,
_loggerFactory.CreateLogger($"Mbproxy.Proxy.ListenerRecovery.{plc.Name}"));
var supervisor = new PlcListenerSupervisor(
plc,
opts.Connection,
_pipeline,
_loggerFactory.CreateLogger<PlcListener>(),
_loggerFactory.CreateLogger<PlcMultiplexer>(),
_loggerFactory.CreateLogger($"Mbproxy.Proxy.UpstreamPipe.{plc.Name}"),
perPlcContext,
recoveryPipeline,
_loggerFactory.CreateLogger<PlcListenerSupervisor>(),
backendPipeline);
_supervisors[plc.Name] = supervisor;
}
// ── Phase 06: wire reconciler BEFORE starting supervisors ─────────────────
// Attach hands the reconciler the authoritative supervisor dictionary and the
// initial options snapshot. The reconciler won't process OnChange events until
// after this call — the brief window between Attach and first supervisor start
// is safe because the channel signal only enqueues; apply runs asynchronously.
_reconciler.Attach(_supervisors, opts);
// Nothing to start: report zero bound listeners and idle until the host stops us.
if (_supervisors.Count == 0)
{
LogStartupReady(_logger, 0, plcsConfigured);
await Task.Delay(Timeout.Infinite, stoppingToken).ConfigureAwait(false);
return;
}
// ── 4. Start all supervisors in parallel ──────────────────────────────────────
var startTasks = _supervisors.Values
.Select(s => s.StartAsync(stoppingToken))
.ToArray();
await Task.WhenAll(startTasks).ConfigureAwait(false);
// ── 5. Wait for every supervisor to complete its first bind attempt ───────────
// "Ready" = every supervisor has transitioned out of Stopped (i.e. reached
// Bound or Recovering from its first attempt).
using var readyCts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
using var readyLinked = CancellationTokenSource.CreateLinkedTokenSource(
readyCts.Token, stoppingToken);
var waitTasks = _supervisors.Values
.Select(s => s.WaitForInitialBindAttemptAsync(readyLinked.Token))
.ToArray();
try
{
await Task.WhenAll(waitTasks).ConfigureAwait(false);
}
catch (OperationCanceledException)
{
// Either the 30 s deadline fired or the service is stopping.
// In both cases execution falls through and the ready line is still logged
// with whatever bound count is observed at that moment.
}
int boundCount = _supervisors.Values.Count(s => s.Snapshot().State == SupervisorState.Bound);
LogStartupReady(_logger, boundCount, plcsConfigured);
// ── 6. Keep the worker alive until the host signals stop ─────────────────────
// Supervisors run their own background loops; ExecuteAsync just waits.
await Task.Delay(Timeout.Infinite, stoppingToken).ConfigureAwait(false);
}
/// <summary>
/// Stops the worker. Cancels <see cref="ExecuteAsync"/> through the base implementation,
/// asks every supervisor to stop in parallel under a 5-second deadline (linked with
/// <paramref name="cancellationToken"/>), then disposes each supervisor and clears the
/// dictionary. Supervisor failures are logged and never prevent the remaining
/// supervisors from being stopped or disposed.
/// </summary>
/// <param name="cancellationToken">Host-supplied shutdown deadline.</param>
public override async Task StopAsync(CancellationToken cancellationToken)
{
    // Cancel ExecuteAsync first.
    await base.StopAsync(cancellationToken).ConfigureAwait(false);

    // Stop all supervisors in parallel with a 5-second hard deadline.
    using var stopCts = new CancellationTokenSource(TimeSpan.FromSeconds(5));
    using var linked = CancellationTokenSource.CreateLinkedTokenSource(
        stopCts.Token, cancellationToken);

    // Snapshot once so the stop pass and the dispose pass operate on the same set and
    // neither enumerates the live dictionary across an await.
    PlcListenerSupervisor[] supervisors = _supervisors.Values.ToArray();

    try
    {
        var stopTasks = supervisors
            .Select(s => s.StopAsync(linked.Token))
            .ToArray();
        await Task.WhenAll(stopTasks).ConfigureAwait(false);
    }
    catch (Exception ex)
    {
        // Best effort — don't let individual supervisor failures block shutdown,
        // but leave a trace so a hung or faulting supervisor can be diagnosed.
        _logger.LogWarning(ex, "One or more supervisors did not stop cleanly during shutdown.");
    }

    foreach (var supervisor in supervisors)
    {
        try
        {
            await supervisor.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            // One failing dispose must not leave the remaining supervisors undisposed.
            _logger.LogWarning(ex, "Supervisor dispose failed during shutdown.");
        }
    }

    _supervisors.Clear();
}
// ── Logging ───────────────────────────────────────────────────────────────────────────
/// <summary>
/// Startup-complete event. Logged once the wait for the supervisors' initial bind attempts
/// has finished (or was cut short by the deadline or service stop).
/// </summary>
/// <param name="logger">Logger the event is written to.</param>
/// <param name="listenersBound">Number of supervisors whose snapshot state was <c>Bound</c> at that moment.</param>
/// <param name="plcsConfigured">Number of configured PLCs, as supplied by the caller.</param>
[LoggerMessage(EventId = 1, EventName = "mbproxy.startup.ready",
Level = LogLevel.Information,
Message = "mbproxy service ready — ListenersBound={ListenersBound} PlcsConfigured={PlcsConfigured}")]
private static partial void LogStartupReady(ILogger logger, int listenersBound, int plcsConfigured);
/// <summary>
/// Error event for a listener that could not be bound.
/// </summary>
/// <remarks>
/// NOTE(review): no call site is visible in this part of the file, and
/// <c>PlcListenerSupervisor</c> declares an event with the same EventName
/// (<c>mbproxy.startup.bind.failed</c>) under EventId 41 — confirm this declaration is still
/// used and that the differing EventIds are intentional.
/// </remarks>
/// <param name="logger">Logger the event is written to.</param>
/// <param name="plc">Value logged as <c>Plc</c>.</param>
/// <param name="port">Value logged as <c>Port</c>.</param>
/// <param name="reason">Value logged as <c>Reason</c>.</param>
[LoggerMessage(EventId = 21, EventName = "mbproxy.startup.bind.failed",
Level = LogLevel.Error,
Message = "Failed to bind listener: Plc={Plc} Port={Port} Reason={Reason}")]
private static partial void LogBindFailed(ILogger logger, string plc, int port, string reason);
}
@@ -0,0 +1,56 @@
namespace Mbproxy.Proxy;
/// <summary>
/// Source-generated <see cref="LoggerMessage"/> definitions for the BCD rewriter pipeline.
/// All event names are stable — do not rename without updating docs/design.md.
/// </summary>
/// <remarks>
/// Every member is a <c>partial</c> declaration only; the bodies are supplied by the logging
/// source generator from the <c>[LoggerMessage]</c> attribute on each method.
/// </remarks>
internal static partial class RewriterLogEvents
{
/// <summary>
/// Emitted when a 32-bit BCD pair is only partially covered by the read/write range.
/// The raw bytes are passed through unchanged; the client or PLC sees the original nibbles.
/// </summary>
/// <param name="logger">Logger the event is written to.</param>
/// <param name="plcName">Value logged as <c>Plc</c>.</param>
/// <param name="address">Value logged as <c>Address</c>.</param>
/// <param name="clientStart">Value logged as <c>ClientStart</c> — presumably the first register of the client's request; confirm at the call site.</param>
/// <param name="clientQty">Value logged as <c>ClientQty</c> — presumably the register count of the client's request; confirm at the call site.</param>
[LoggerMessage(
EventId = 30,
EventName = "mbproxy.rewrite.partial_bcd",
Level = LogLevel.Warning,
Message = "Partial BCD overlap — passing through raw: Plc={PlcName} Address={Address} ClientStart={ClientStart} ClientQty={ClientQty}")]
public static partial void PartialBcd(
ILogger logger,
string plcName,
ushort address,
ushort clientStart,
ushort clientQty);
/// <summary>
/// Emitted when a register value at a configured BCD address contains a nibble >= 0xA
/// (i.e. not a valid BCD digit). The raw bytes are passed through unchanged.
/// Direction is "Read" (response from PLC) or "Write" (request from client).
/// </summary>
/// <param name="logger">Logger the event is written to.</param>
/// <param name="plcName">Value logged as <c>Plc</c>.</param>
/// <param name="address">Value logged as <c>Address</c>.</param>
/// <param name="rawValue">The offending register value; rendered as four hex digits (<c>X4</c>) in the message.</param>
/// <param name="direction">Value logged as <c>Direction</c>; a free-form string, so callers are responsible for passing "Read" or "Write".</param>
[LoggerMessage(
EventId = 31,
EventName = "mbproxy.rewrite.invalid_bcd",
Level = LogLevel.Warning,
Message = "Invalid BCD nibble — passing through raw: Plc={PlcName} Address={Address} RawValue=0x{RawValue:X4} Direction={Direction}")]
public static partial void InvalidBcd(
ILogger logger,
string plcName,
ushort address,
ushort rawValue,
string direction);
/// <summary>
/// Emitted when the PLC returns a Modbus exception response (high bit set on FC byte).
/// The frame is forwarded verbatim to the client.
/// </summary>
/// <param name="logger">Logger the event is written to.</param>
/// <param name="plcName">Value logged as <c>Plc</c>.</param>
/// <param name="fc">Function-code byte; rendered as two hex digits (<c>X2</c>) in the message.</param>
/// <param name="exceptionCode">Value logged as <c>ExceptionCode</c>.</param>
[LoggerMessage(
EventId = 32,
EventName = "mbproxy.exception.passthrough",
Level = LogLevel.Information,
Message = "Modbus exception forwarded: Plc={PlcName} Fc=0x{Fc:X2} ExceptionCode={ExceptionCode}")]
public static partial void ExceptionPassthrough(
ILogger logger,
string plcName,
byte fc,
byte exceptionCode);
}
@@ -0,0 +1,404 @@
using Mbproxy.Options;
using Mbproxy.Proxy.Multiplexing;
using Polly;
namespace Mbproxy.Proxy.Supervision;
/// <summary>
/// Wraps one <see cref="PlcListener"/> in a Polly-backed recovery loop.
///
/// <para><b>State machine</b>:
/// <list type="bullet">
/// <item><description><b>Bound</b>: listener is accepting connections; <see cref="PlcListener.RunAsync"/> is awaiting.</description></item>
/// <item><description><b>Recovering</b>: bind failed or RunAsync faulted; in Polly's delay window before the next attempt.</description></item>
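/// <item><description><b>Stopped</b>: the initial state before the first bind attempt completes, and the terminal state once <see cref="StopAsync"/> has been called or the supervisor loop has exited; no further retries.</description></item>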
/// </list>
/// </para>
///
/// <para><b>RecoveryAttempts</b>: the counter accumulates over the lifetime of the
/// supervisor. It is never reset after a successful re-bind so operators can see
/// "this listener has flapped N times since the service started." See also
/// <see cref="SupervisorSnapshot"/> doc comment.</para>
///
/// <para>The supervisor does NOT swallow exceptions from <see cref="PlcListener.RunAsync"/>
/// except <see cref="OperationCanceledException"/>. Every other fault is logged at Warning
/// with the exception message so operators can see WHY the listener was restarted.</para>
/// </summary>
internal sealed partial class PlcListenerSupervisor : IAsyncDisposable
{
private readonly PlcOptions _plc;
private readonly ConnectionOptions _connectionOptions;
private readonly IPduPipeline _pipeline;
private readonly ILogger<PlcListener> _listenerLogger;
private readonly ILogger<PlcMultiplexer> _multiplexerLogger;
private readonly ILogger _pipeLogger;
private readonly PerPlcContext? _perPlcContext;
private readonly ResiliencePipeline _recoveryPipeline;
private readonly ILogger<PlcListenerSupervisor> _logger;
private readonly ResiliencePipeline? _backendConnectPipeline;
// ── Mutable state ────────────────────────────────────────────────────────────────────
// Volatile so Snapshot() reads are coherent without locking.
private volatile SupervisorState _state = SupervisorState.Stopped;
private volatile string? _lastBindError;
private int _recoveryAttempts; // Interlocked
// Phase 07: current active listener for status-page pair enumeration.
private volatile PlcListener? _currentListener;
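// Phase 06: the live context is held in the mutable _currentContext field below so that
// ReplaceContextAsync can swap it (_perPlcContext itself is readonly and only records the
// constructor argument).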
// Access from the accept loop (RunAsync) and from ReplaceContextAsync must be
// coherent; we use a volatile reference so the accept loop always reads the latest
// context without locking. The PlcListener created on each Polly attempt holds
// its own copy of the context at construction time; existing in-flight connections
// keep their old reference until they complete.
private volatile PerPlcContext? _currentContext;
/// <summary>
/// Per-supervisor CTS: cancelling it stops both the Polly delay and the inner
/// <see cref="PlcListener.RunAsync"/> loop.
/// </summary>
private CancellationTokenSource _supervisorCts = new();
private Task _supervisorTask = Task.CompletedTask;
private bool _disposed;
// ── Public surface ────────────────────────────────────────────────────────────────────
/// <summary>The <c>Name</c> of the <see cref="PlcOptions"/> this supervisor was constructed with.</summary>
public string PlcName => _plc.Name;
/// <summary>
/// Creates a supervisor for one PLC listener. The constructor only stores its arguments;
/// the background loop is not started until <see cref="StartAsync"/> is called.
/// </summary>
/// <param name="plc">Options for the PLC; its <c>Name</c> and <c>ListenPort</c> appear in this class's log events.</param>
/// <param name="connectionOptions">Forwarded unchanged to every <see cref="PlcListener"/> the supervisor constructs.</param>
/// <param name="pipeline">PDU pipeline forwarded unchanged to every <see cref="PlcListener"/>.</param>
/// <param name="listenerLogger">Logger forwarded to every <see cref="PlcListener"/>.</param>
/// <param name="multiplexerLogger">Logger forwarded to every <see cref="PlcListener"/> (typed for <see cref="PlcMultiplexer"/>).</param>
/// <param name="pipeLogger">Logger forwarded to every <see cref="PlcListener"/> — presumably for upstream pipes; confirm against <see cref="PlcListener"/>.</param>
/// <param name="perPlcContext">
/// Initial per-PLC context; may be <c>null</c>. It seeds the live context slot that
/// <see cref="ReplaceContextAsync"/> can later overwrite.
/// </param>
/// <param name="recoveryPipeline">Polly pipeline that wraps the whole bind-and-run cycle of the supervisor loop.</param>
/// <param name="logger">Logger for the supervisor's own events.</param>
/// <param name="backendConnectPipeline">
/// Optional pipeline forwarded to every <see cref="PlcListener"/> — presumably used for
/// backend connect retries; confirm against <see cref="PlcListener"/>.
/// </param>
public PlcListenerSupervisor(
PlcOptions plc,
ConnectionOptions connectionOptions,
IPduPipeline pipeline,
ILogger<PlcListener> listenerLogger,
ILogger<PlcMultiplexer> multiplexerLogger,
ILogger pipeLogger,
PerPlcContext? perPlcContext,
ResiliencePipeline recoveryPipeline,
ILogger<PlcListenerSupervisor> logger,
ResiliencePipeline? backendConnectPipeline = null)
{
_plc = plc;
_connectionOptions = connectionOptions;
_pipeline = pipeline;
_listenerLogger = listenerLogger;
_multiplexerLogger = multiplexerLogger;
_pipeLogger = pipeLogger;
// The same argument is stored twice: once as the immutable original, once as the
// swappable live slot on the next line.
_perPlcContext = perPlcContext;
_currentContext = perPlcContext; // Phase 06: live context slot
_recoveryPipeline = recoveryPipeline;
_logger = logger;
_backendConnectPipeline = backendConnectPipeline;
}
/// <summary>
/// The <see cref="ProxyCounters"/> of the context currently held in the live slot.
/// <see cref="Configuration.ConfigReconciler"/> reads this when it builds a reseat context,
/// so the counters carry over across a tag-map swap.
/// </summary>
/// <remarks>
/// When no counters are available a brand-new <see cref="ProxyCounters"/> is created for
/// that call, so the result is never <c>null</c> — but such an instance is not stored
/// anywhere by this property.
/// </remarks>
public ProxyCounters CurrentCounters
{
    get
    {
        // Read the volatile slot once and work from the local copy.
        var liveContext = _currentContext;
        return liveContext?.Counters ?? new ProxyCounters();
    }
}
/// <summary>
/// The <see cref="UpstreamPipe"/> instances currently attached to this PLC's multiplexer,
/// as exposed by the active listener. An empty collection is returned while no listener
/// is registered (i.e. the supervisor is not bound).
/// Consumed by Phase 07's status page (renamed from <c>ActivePairs</c> in Phase 9).
/// </summary>
public IReadOnlyCollection<UpstreamPipe> ActiveUpstreams
{
    get
    {
        // Read the volatile slot once; the listener may be swapped out concurrently.
        var activeListener = _currentListener;
        if (activeListener is null)
        {
            return Array.Empty<UpstreamPipe>();
        }

        return activeListener.ActiveUpstreams ?? Array.Empty<UpstreamPipe>();
    }
}
/// <summary>
/// Launches the supervisor task. The task tries to bind immediately; if binding
/// fails it enters the Polly recovery loop. The method returns as soon as the
/// background task is started (it does NOT wait for the listener to reach
/// <see cref="SupervisorState.Bound"/>).
///
/// <para>Call <see cref="WaitForInitialBindAttemptAsync"/> after this to block until the
/// supervisor has transitioned out of <see cref="SupervisorState.Stopped"/>.</para>
/// </summary>
/// <param name="ct">
/// Parent token. The supervisor's own cancellation source is linked to it, so cancelling
/// <paramref name="ct"/> stops the supervisor loop just as <see cref="StopAsync"/> does.
/// </param>
/// <returns>An already-completed task; the supervisor loop runs in the background.</returns>
public Task StartAsync(CancellationToken ct)
{
    // Remember what is being replaced so it can be released once the new loop is running.
    CancellationTokenSource previousCts = _supervisorCts;
    bool previousLoopFinished = _supervisorTask.IsCompleted;

    CancellationTokenSource linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
    _supervisorCts = linkedCts;

    // Capture the token here instead of reading the _supervisorCts field inside the lambda:
    // the lambda runs later on the thread pool, by which time the field may have been
    // replaced or its source disposed.
    CancellationToken supervisorToken = linkedCts.Token;
    _supervisorTask = Task.Run(() => RunSupervisorAsync(supervisorToken), CancellationToken.None);

    // Release the placeholder (or a finished loop's) source. A source that may still be
    // driving a running loop is left alone so that loop stays linked to its parent token.
    if (previousLoopFinished)
    {
        previousCts.Dispose();
    }

    return Task.CompletedTask;
}
/// <summary>
/// Polls (every 10 ms) until the supervisor has left <see cref="SupervisorState.Stopped"/>,
/// i.e. its first bind attempt has ended in <see cref="SupervisorState.Bound"/> or
/// <see cref="SupervisorState.Recovering"/>. Returns at once if that has already happened
/// or if the supervisor task has already completed.
/// </summary>
/// <param name="ct">
/// Cancels the wait. If cancellation is observed between polls the method simply returns;
/// if it fires during the 10 ms delay the delay throws
/// <see cref="OperationCanceledException"/>.
/// </param>
public async Task WaitForInitialBindAttemptAsync(CancellationToken ct)
{
    while (true)
    {
        bool leftInitialState = _state != SupervisorState.Stopped;
        if (leftInitialState || ct.IsCancellationRequested || _supervisorTask.IsCompleted)
        {
            return;
        }

        await Task.Delay(10, ct).ConfigureAwait(false);
    }
}
/// <summary>
/// Stops the supervisor: marks the state as <see cref="SupervisorState.Stopped"/>, cancels
/// the supervisor's own token (which is the token handed to both the Polly pipeline and
/// <see cref="PlcListener.RunAsync"/>), and then waits for the background task to finish.
/// </summary>
/// <param name="ct">
/// Bounds the wait for the background task. If it fires first, this method returns anyway
/// and the task is left to finish on its own.
/// </param>
public async Task StopAsync(CancellationToken ct)
{
    _state = SupervisorState.Stopped;
    await _supervisorCts.CancelAsync().ConfigureAwait(false);

    Task backgroundLoop = _supervisorTask;
    try
    {
        await backgroundLoop.WaitAsync(ct).ConfigureAwait(false);
    }
    catch (Exception)
    {
        // Deliberately ignored in both possible cases:
        //  - OperationCanceledException: ct fired before the task completed; the supervisor
        //    task will terminate asynchronously, which is acceptable at shutdown.
        //  - anything else: the supervisor task faulted, which RunSupervisorAsync has
        //    already logged.
    }
}
/// <summary>
/// Captures the supervisor's current state, last bind error and recovery-attempt count.
/// </summary>
/// <remarks>
/// The three values are read one after another without a lock, so they are individually
/// coherent but not guaranteed to belong to the same instant.
/// </remarks>
public SupervisorSnapshot Snapshot()
{
    SupervisorState state = _state;
    string? lastError = _lastBindError;

    // CompareExchange(ref x, 0, 0) never alters the value (it only stores 0 when the value
    // already is 0); it is used here purely as an atomic read of the counter.
    int attempts = Interlocked.CompareExchange(ref _recoveryAttempts, 0, 0);

    return new SupervisorSnapshot(
        State: state,
        LastBindError: lastError,
        RecoveryAttempts: attempts);
}
/// <summary>
/// Publishes a new per-PLC context (tag map) into the supervisor's live slot without
/// restarting the listener.
///
/// <para><b>What the method does</b>: a single volatile write of <paramref name="newCtx"/>
/// to the field that the supervisor loop reads each time it constructs a
/// <see cref="PlcListener"/>. The next listener constructed inside the Polly loop (on any
/// subsequent fault recovery) therefore picks up <paramref name="newCtx"/>.</para>
///
/// <para><b>Transition window</b>: the listener that is running at the time of the call was
/// given the old context at construction. Existing in-flight upstream pipes served by the
/// current multiplexer keep the reference captured at multiplexer construction time and
/// finish on the old map — partial-BCD rewrites mid-request would be worse than a
/// one-request gap.</para>
///
/// <para>NOTE(review): nothing in this class hands <paramref name="newCtx"/> to the listener
/// that is already running, so whether <em>new connections on the current listener</em> see
/// the new map before the next re-bind depends on <see cref="PlcListener"/> — confirm there.</para>
///
/// <para>This method is intentionally lightweight: it performs only the volatile write
/// and returns immediately. The <paramref name="ct"/> parameter is present for API
/// symmetry with start/stop and to accommodate future async expansion.</para>
/// </summary>
/// <param name="newCtx">Context to store in the live slot.</param>
/// <param name="ct">Currently unused.</param>
/// <returns>An already-completed task.</returns>
public Task ReplaceContextAsync(PerPlcContext newCtx, CancellationToken ct)
{
// Volatile write: the next PlcListener created in RunSupervisorAsync will see
// the new context. The accept loop itself does not hold a direct reference to
// _currentContext — it was captured at PlcListener construction time.
_currentContext = newCtx;
return Task.CompletedTask;
}
// ── Supervisor loop ───────────────────────────────────────────────────────────────────
/// <summary>
/// Background loop of the supervisor. Runs the bind-and-accept cycle inside the recovery
/// pipeline: every cycle constructs a fresh <see cref="PlcListener"/>, binds it, runs its
/// accept loop and, when that ends for any reason other than cancellation, records a
/// recovery attempt and throws so the pipeline can delay and retry.
/// </summary>
/// <param name="ct">Supervisor token; cancelling it is the normal way to end the loop.</param>
/// <returns>
/// A task that completes when the loop has exited. Exceptions are logged here and are not
/// propagated to the caller.
/// </returns>
private async Task RunSupervisorAsync(CancellationToken ct)
{
    bool firstBind = true;

    try
    {
        // The recovery pipeline wraps the entire try-bind-and-run block.
        // When RunAsync returns or throws, the pipeline delays and retries.
        // Cancellation of ct exits the pipeline with OperationCanceledException.
        await _recoveryPipeline.ExecuteAsync(async token =>
        {
            // ── Instantiate a fresh listener ─────────────────────────────────
            // A faulted listener's TcpListener socket must be disposed before
            // re-binding, so a new PlcListener is created on each attempt.
            //
            // Phase 06: _currentContext (volatile) is read here so that a
            // ReplaceContextAsync call between attempts is picked up. Each listener
            // captures the context at construction time.
            var listener = new PlcListener(
                _plc,
                _connectionOptions,
                _pipeline,
                _listenerLogger,
                _multiplexerLogger,
                _pipeLogger,
                _currentContext,
                _backendConnectPipeline);

            // Phase 07: expose the current listener for status-page enumeration.
            _currentListener = listener;

            try
            {
                // ── Bind ─────────────────────────────────────────────────────
                // NOTE(review): the call is not awaited, so only a failure thrown
                // synchronously by StartAsync reaches the catch below — confirm against
                // PlcListener.StartAsync's signature.
                listener.StartAsync();
            }
            catch (Exception bindEx)
            {
                // Release the socket before the recovery delay so the port can be reused.
                await ReleaseListenerAsync(listener).ConfigureAwait(false);
                string reason = EnterRecovery(bindEx.Message);
                LogBindFailed(_logger, _plc.Name, _plc.ListenPort, reason);

                // Re-throw so the Polly pipeline can delay and retry.
                throw;
            }

            // ── Bind succeeded ───────────────────────────────────────────────
            if (firstBind)
            {
                firstBind = false;
                LogBound(_logger, _plc.Name, _plc.ListenPort);
            }
            else
            {
                // Re-bind after an earlier successful bind — emit the "recovered" event.
                int totalAttempts = Interlocked.CompareExchange(ref _recoveryAttempts, 0, 0);
                LogListenerRecovered(_logger, _plc.Name, _plc.ListenPort, totalAttempts);
            }

            // Clear the last bind error on a successful bind.
            _lastBindError = null;
            _currentContext?.Counters.ClearLastBindError();
            _state = SupervisorState.Bound;

            // ── Run the accept loop ──────────────────────────────────────────
            // RunAsync ends when the token is cancelled (normal shutdown) or when the
            // listener faults or closes on its own.
            try
            {
                await listener.RunAsync(token).ConfigureAwait(false);
            }
            catch (OperationCanceledException)
            {
                // Cancellation is never retried: release the listener and leave the pipeline.
                await ReleaseListenerAsync(listener).ConfigureAwait(false);
                throw;
            }
            catch (Exception runEx)
            {
                // Listener faulted at runtime. Log at Warning — operators must see WHY
                // the listener was restarted.
                LogListenerFaulted(_logger, _plc.Name, _plc.ListenPort, runEx, runEx.Message);
                await ReleaseListenerAsync(listener).ConfigureAwait(false);
                EnterRecovery(runEx.Message);

                // Re-throw so Polly can delay and retry.
                throw;
            }

            // RunAsync returned without throwing.
            await ReleaseListenerAsync(listener).ConfigureAwait(false);

            // If cancellation is requested, throw so Polly exits cleanly.
            token.ThrowIfCancellationRequested();

            // Otherwise the listener closed without cancellation: treat it as a fault
            // and re-enter recovery.
            const string unexpectedEnd = "Listener accept loop ended unexpectedly";
            EnterRecovery(unexpectedEnd);
            LogListenerEnded(_logger, _plc.Name, _plc.ListenPort);
            throw new InvalidOperationException(unexpectedEnd);
        }, ct).ConfigureAwait(false);
    }
    catch (OperationCanceledException) when (ct.IsCancellationRequested)
    {
        // Normal: the supervisor token was cancelled (StopAsync or the parent token).
    }
    catch (Exception ex)
    {
        // Either the pipeline gave up, an unexpected fault escaped, or a cancellation
        // exception surfaced although the supervisor token was never cancelled. In every
        // case the supervisor is about to stop for good, so it must not pass silently.
        _logger.LogError(ex, "Supervisor for Plc={Plc} exited unexpectedly: {Message}",
            _plc.Name, ex.Message);
    }
    finally
    {
        _state = SupervisorState.Stopped;
        _currentListener = null;
    }
}

/// <summary>
/// Withdraws <paramref name="listener"/> from the status-page slot and disposes it.
/// </summary>
private async ValueTask ReleaseListenerAsync(PlcListener listener)
{
    _currentListener = null;
    await listener.DisposeAsync().ConfigureAwait(false);
}

/// <summary>
/// Records one recovery attempt: bumps the attempt counter, stores
/// <paramref name="reason"/> (capped at 256 characters) as the last error, switches the
/// state to <see cref="SupervisorState.Recovering"/> and mirrors the attempt into the
/// per-PLC counters when a context is present.
/// </summary>
/// <returns>The reason as stored, i.e. after truncation.</returns>
private string EnterRecovery(string reason)
{
    Interlocked.Increment(ref _recoveryAttempts);
    string truncated = reason.Length > 256 ? reason[..256] : reason;
    _lastBindError = truncated;
    _state = SupervisorState.Recovering;

    // Also update the per-PLC counters if available (Phase 07 reads these).
    _currentContext?.Counters.IncrementRecoveryAttempt(truncated);
    return truncated;
}
// ── IAsyncDisposable ─────────────────────────────────────────────────────────────────
/// <summary>
/// Stops the supervisor (waiting at most 5 seconds for its background task) and disposes
/// the supervisor's cancellation source. Calls after the first one do nothing.
/// </summary>
public async ValueTask DisposeAsync()
{
    if (!_disposed)
    {
        _disposed = true;

        using (var stopDeadline = new CancellationTokenSource(TimeSpan.FromSeconds(5)))
        {
            try
            {
                await StopAsync(stopDeadline.Token).ConfigureAwait(false);
            }
            catch
            {
                // Best-effort cleanup.
            }
        }

        _supervisorCts.Dispose();
    }
}
// ── Logging ───────────────────────────────────────────────────────────────────────────
/// <summary>
/// Logged by the supervisor loop the first time a bind succeeds; later successful binds
/// log <see cref="LogListenerRecovered"/> instead.
/// </summary>
[LoggerMessage(EventId = 40, EventName = "mbproxy.startup.bind",
Level = LogLevel.Information,
Message = "Listener bound: Plc={Plc} Port={Port}")]
private static partial void LogBound(ILogger logger, string plc, int port);
/// <summary>
/// Logged when binding a freshly constructed listener throws. <paramref name="reason"/>
/// is the exception message, truncated to 256 characters by the caller.
/// </summary>
[LoggerMessage(EventId = 41, EventName = "mbproxy.startup.bind.failed",
Level = LogLevel.Error,
Message = "Failed to bind listener: Plc={Plc} Port={Port} Reason={Reason}")]
private static partial void LogBindFailed(ILogger logger, string plc, int port, string reason);
/// <summary>
/// Logged when a bind succeeds after an earlier successful bind.
/// <paramref name="attemptCount"/> is the lifetime recovery-attempt counter at that moment.
/// </summary>
[LoggerMessage(EventId = 42, EventName = "mbproxy.listener.recovered",
Level = LogLevel.Information,
Message = "Listener recovered: Plc={Plc} Port={Port} AttemptCount={AttemptCount}")]
private static partial void LogListenerRecovered(ILogger logger, string plc, int port, int attemptCount);
/// <summary>
/// Logged when the listener's accept loop throws something other than
/// <see cref="OperationCanceledException"/>. <paramref name="ex"/> is attached to the log
/// entry; <paramref name="reason"/> is its (untruncated) message.
/// </summary>
[LoggerMessage(EventId = 43, EventName = "mbproxy.listener.faulted",
Level = LogLevel.Warning,
Message = "Listener faulted (will recover): Plc={Plc} Port={Port} Reason={Reason}")]
private static partial void LogListenerFaulted(ILogger logger, string plc, int port, Exception ex, string reason);
/// <summary>
/// Logged when the listener's accept loop returns without throwing although cancellation
/// was not requested.
/// </summary>
[LoggerMessage(EventId = 44, EventName = "mbproxy.listener.ended",
Level = LogLevel.Warning,
Message = "Listener accept loop ended unexpectedly (will recover): Plc={Plc} Port={Port}")]
private static partial void LogListenerEnded(ILogger logger, string plc, int port);
}
@@ -0,0 +1,125 @@
using System.Net.Sockets;
using Mbproxy.Options;
using Polly;
using Polly.Retry;
namespace Mbproxy.Proxy.Supervision;
/// <summary>
/// Builds Polly v8 <see cref="ResiliencePipeline"/> instances from the typed resilience
/// configuration (<see cref="RetryProfile"/> and <see cref="RecoveryProfile"/>).
///
/// <para>Pipelines are built once at startup and reused across all operations. They are
/// thread-safe and allocation-free on the happy path.</para>
/// </summary>
internal static class PolicyFactory
{
// ── Network errors that are safe to retry on backend connect ────────────────────────
// Only these SocketError values are transient; everything else is a programming error
// or a configuration mistake and should not be retried.
private static readonly HashSet<SocketError> RetryableSocketErrors =
[
SocketError.ConnectionRefused,
SocketError.TimedOut,
SocketError.HostUnreachable,
SocketError.NetworkUnreachable,
];
/// <summary>
/// Builds a retry pipeline for backend (PLC) TCP connect attempts.
///
/// <para>Retries only on <see cref="SocketException"/> with a
/// <see cref="SocketError"/> in <see cref="RetryableSocketErrors"/>. Does NOT retry
/// <see cref="ArgumentException"/>, <see cref="OperationCanceledException"/>, or any
/// non-network exception.</para>
///
/// <para>The delay sequence is taken directly from <see cref="RetryProfile.BackoffMs"/>;
/// element [i] is the delay before retry i (0-based). If the retry index exceeds the
/// list, the last element is used; if the list is missing or empty, the delay is 0.</para>
///
/// <para>A profile that allows only a single attempt (<c>MaxAttempts</c> of 1 or less)
/// yields the no-op pipeline <see cref="ResiliencePipeline.Empty"/>: the connect runs once
/// and any exception propagates unchanged.</para>
///
/// <para>After all attempts are exhausted, the pipeline re-throws the last exception
/// so the caller can log <c>mbproxy.backend.failed</c> and close the upstream socket.</para>
/// </summary>
/// <param name="profile">Retry configuration (total attempts and per-retry delays).</param>
/// <param name="logger">Receives a Debug entry before each retry.</param>
/// <returns>A reusable pipeline; never <c>null</c>.</returns>
public static ResiliencePipeline BuildBackendConnect(RetryProfile profile, ILogger logger)
{
    // MaxAttempts counts the first attempt; Polly's MaxRetryAttempts does not.
    int maxAttempts = Math.Max(1, profile.MaxAttempts);
    int maxRetries = maxAttempts - 1;

    // Polly v8 validates RetryStrategyOptions.MaxRetryAttempts with a lower bound of 1, so
    // "one attempt, no retries" cannot be expressed as a retry strategy with zero retries.
    // The empty pipeline has exactly the wanted behaviour: execute once, propagate failures.
    if (maxRetries == 0)
    {
        return ResiliencePipeline.Empty;
    }

    var backoffMs = profile.BackoffMs;

    return new ResiliencePipelineBuilder()
        .AddRetry(new RetryStrategyOptions
        {
            MaxRetryAttempts = maxRetries,
            ShouldHandle = new PredicateBuilder()
                .Handle<SocketException>(ex => RetryableSocketErrors.Contains(ex.SocketErrorCode)),
            DelayGenerator = args =>
            {
                int idx = args.AttemptNumber; // 0 = first retry, i.e. after attempt 0

                // Clamp to the last element if we exceed the list; a missing or empty
                // list means "retry immediately".
                int ms = backoffMs is { Count: > 0 }
                    ? backoffMs[Math.Min(idx, backoffMs.Count - 1)]
                    : 0;
                return new ValueTask<TimeSpan?>(TimeSpan.FromMilliseconds(ms));
            },
            OnRetry = args =>
            {
                logger.LogDebug(
                    "Backend connect retry {Attempt}/{Max}: {Error}",
                    args.AttemptNumber + 1,
                    maxRetries,
                    args.Outcome.Exception?.Message);
                return ValueTask.CompletedTask;
            },
        })
        .Build();
}
/// <summary>
/// Builds the retry pipeline used for listener bind recovery, configured with
/// <see cref="int.MaxValue"/> retries so that in practice it only ends through cancellation.
///
/// <para>Delay schedule:
/// <list type="bullet">
/// <item><description>Retry i (0-based) uses element i of <see cref="RecoveryProfile.InitialBackoffMs"/> while i is within that list.</description></item>
/// <item><description>Every later retry uses <see cref="RecoveryProfile.SteadyStateMs"/>.</description></item>
/// </list>
/// </para>
///
/// <para>Every exception except <see cref="OperationCanceledException"/> is retried.
/// Cancellation is therefore never retried and propagates out of the pipeline, which is
/// how the supervisor leaves the loop on <see cref="PlcListenerSupervisor.StopAsync"/>.</para>
/// </summary>
/// <param name="profile">Recovery configuration (initial backoff list and steady-state delay).</param>
/// <param name="logger">Receives a Debug entry before each retry.</param>
/// <returns>A reusable pipeline.</returns>
public static ResiliencePipeline BuildListenerRecovery(RecoveryProfile profile, ILogger logger)
{
    var rampMs = profile.InitialBackoffMs;
    int plateauMs = profile.SteadyStateMs;

    var retryOptions = new RetryStrategyOptions
    {
        // int.MaxValue makes the pipeline retry indefinitely; cancellation
        // is the only exit path (besides the supervisor calling StopAsync).
        MaxRetryAttempts = int.MaxValue,
        ShouldHandle = new PredicateBuilder().Handle<Exception>(
            ex => ex is not OperationCanceledException),
        DelayGenerator = args =>
        {
            // AttemptNumber is the zero-based retry index (0 = the retry that follows
            // the first failed attempt).
            int retryIndex = args.AttemptNumber;
            int delayMs = retryIndex >= rampMs.Count
                ? plateauMs
                : rampMs[retryIndex];
            return new ValueTask<TimeSpan?>(TimeSpan.FromMilliseconds(delayMs));
        },
        OnRetry = args =>
        {
            logger.LogDebug(
                "Listener recovery attempt {Attempt}: {Error}",
                args.AttemptNumber + 1,
                args.Outcome.Exception?.Message);
            return ValueTask.CompletedTask;
        },
    };

    return new ResiliencePipelineBuilder()
        .AddRetry(retryOptions)
        .Build();
}
}
@@ -0,0 +1,50 @@
namespace Mbproxy.Proxy.Supervision;
/// <summary>
/// State machine states for <see cref="PlcListenerSupervisor"/>.
/// </summary>
/// <remarks>
/// Note that <see cref="Bound"/> is the first member and therefore the enum's zero/default
/// value; the supervisor itself explicitly initialises its state to <see cref="Stopped"/>.
/// </remarks>
public enum SupervisorState
{
/// <summary>
/// The listener is bound and its accept loop is running.
/// Entry conditions: <see cref="PlcListener.StartAsync"/> succeeded (on first attempt or
/// after a recovery attempt).
/// </summary>
Bound,
/// <summary>
/// The listener is not bound; the supervisor is waiting for the next Polly retry delay
/// before reattempting. Entered after a failed bind, after the running listener faults,
/// and after the accept loop ends unexpectedly without cancellation.
/// </summary>
Recovering,
/// <summary>
/// The supervisor is not running. This is both the initial state (before the first bind
/// attempt has completed) and the terminal state: it is set when
/// <see cref="PlcListenerSupervisor.StopAsync"/> is called and again when the supervisor
/// loop exits, after which no further retries happen.
/// </summary>
Stopped,
}
/// <summary>
/// Immutable point-in-time snapshot of a supervisor's state. Consumed by Phase 07's
/// status page via <see cref="PlcListenerSupervisor.Snapshot"/>.
///
/// <para><b>RecoveryAttempts semantics</b>: this counter <em>accumulates over the lifetime
/// of the supervisor</em> and is never reset. Operators reading the status page should
/// interpret it as "how many times has this listener faulted or failed to bind since
/// the service started" — useful for detecting port-flapping or repeated OS network
/// resets. Phase 07 surfaces it as-is.</para>
/// </summary>
/// <param name="State">Current state of the supervisor.</param>
/// <param name="LastBindError">
/// Message of the most recent failure (up to 256 chars) — a failed bind, a runtime listener
/// fault, or an unexpected end of the accept loop. <c>null</c> if no failure has been
/// recorded, or if a successful bind has cleared it since.
/// </param>
/// <param name="RecoveryAttempts">
/// Total number of recovery attempts over the lifetime of this supervisor: failed binds,
/// runtime listener faults and unexpected accept-loop exits. Accumulates; never resets to 0.
/// </param>
public sealed record SupervisorSnapshot(
SupervisorState State,
string? LastBindError,
int RecoveryAttempts);