mbproxy: initial commit through Phase 9 (TxId multiplexing)
Adds the mbproxy service end-to-end. Phases 00-08 implement the production-ready single-listener / 1:1-backend transparent Modbus TCP proxy with bidirectional BCD rewriting for the ~54-PLC DL205/DL260 fleet. Phase 9 replaces the connection layer with a single backend socket per PLC plus MBAP TxId rewriting, lifting the H2-ECOM100's 4-concurrent-client cap as an operational ceiling. Phase 9 additions of note: - PlcMultiplexer + UpstreamPipe + TxIdAllocator + CorrelationMap - InFlightRequest with IReadOnlyList<InterestedParty> (load-bearing for Phase 10 read coalescing — do not collapse to a single field) - Per-request watchdog: surfaces Modbus exception 0x0B to upstream on BackendRequestTimeoutMs, defending against lost responses, dead-PLC paths, and pymodbus 3.13.0's concurrent-multiplexed- request bug (its ServerRequestHandler.last_pdu state race) - Status DTO + HTML gain inFlight / maxInFlight / txIdWraps / disconnectCascades / queueDepth (Tier 1.6 in docs/kpi.md) Tests: 263 unit + 38 E2E. Multiplexer correctness under truly concurrent backend traffic is proved against a stub backend in PlcMultiplexerTests; MultiplexerE2ETests paces requests so pymodbus 3.13's single-PDU framer stays in known-good mode. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,460 @@
|
||||
using Mbproxy.Bcd;
|
||||
|
||||
namespace Mbproxy.Proxy;
|
||||
|
||||
/// <summary>
|
||||
/// BCD-rewriting PDU pipeline. Registered as the singleton <see cref="IPduPipeline"/>
|
||||
/// in production (replaces <see cref="NoopPduPipeline"/> from Phase 03).
|
||||
///
|
||||
/// FC scope (per design.md):
|
||||
/// FC03 / FC04 response — decode covered BCD slots from raw nibbles → binary integer.
|
||||
/// FC06 request — encode binary integer → BCD nibbles.
|
||||
/// FC16 request — per-register over the configured slots.
|
||||
/// All other FCs — pass through byte-for-byte.
|
||||
///
|
||||
/// MBAP transparency contract: the MBAP length field is NEVER modified. Re-encoded slots
|
||||
/// are the same byte width as the originals (ushort → ushort), so the PDU length is stable.
|
||||
///
|
||||
/// <para><b>Phase 9 — request correlation:</b> FC03/FC04 responses do not carry the
|
||||
/// original start address. The multiplexer builds an <see cref="Multiplexing.InFlightRequest"/>
|
||||
/// on the request path, stores it in its <see cref="Multiplexing.CorrelationMap"/>, and
|
||||
/// attaches it to the per-call <see cref="PerPlcContext.CurrentRequest"/> on the response
|
||||
/// path. The rewriter consumes <c>CurrentRequest</c> instead of a per-pair last-request
|
||||
/// slot, so concurrent responses from different upstream clients each decode against
|
||||
/// their own request range without cross-talk.</para>
|
||||
///
|
||||
/// <para>This class is stateless. All per-call state arrives via <see cref="PduContext"/>
|
||||
/// (specifically <see cref="PerPlcContext.CurrentRequest"/> on response). It is safe to
|
||||
/// call concurrently from multiple upstream-read tasks and the single backend reader task.</para>
|
||||
/// </summary>
|
||||
internal sealed class BcdPduPipeline : IPduPipeline
|
||||
{
|
||||
// ── IPduPipeline.Process ─────────────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Pipeline entry point, invoked once per PDU in each direction. Updates the per-PLC
/// counters and hands the PDU to the request- or response-side rewriter.
/// </summary>
/// <param name="direction">Selects the request path or the response path.</param>
/// <param name="mbapHeader">MBAP header of the frame; not read by this implementation.</param>
/// <param name="pdu">PDU bytes starting at the function code; may be rewritten in place.</param>
/// <param name="context">
/// Must be a <see cref="PerPlcContext"/> for any work to happen; any other
/// <see cref="PduContext"/> makes the call a no-op.
/// </param>
public void Process(
    MbapDirection direction,
    ReadOnlySpan<byte> mbapHeader,
    Span<byte> pdu,
    PduContext context)
{
    // The tag map, counters and logger all live on PerPlcContext. A plain PduContext
    // (e.g. a test fixture built for the no-op pipeline) gives us nothing to work with.
    if (context is not PerPlcContext plc)
    {
        return;
    }

    // No function code byte means nothing to count or rewrite.
    if (pdu.IsEmpty)
    {
        return;
    }

    byte functionCode = pdu[0];
    plc.Counters.IncrementPdusForwarded();
    plc.Counters.IncrementFcCount(functionCode);

    switch (direction)
    {
        case MbapDirection.RequestToBackend:
            ProcessRequest(functionCode, pdu, plc);
            break;

        default:
            ProcessResponse(functionCode, pdu, plc);
            break;
    }
}
|
||||
|
||||
// ── Request processing (FC06 / FC16) ────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Request-side dispatch. Only FC06 (0x06) and FC16 (0x10) are rewritten; every other
/// function code leaves <paramref name="pdu"/> untouched.
/// </summary>
private static void ProcessRequest(byte fc, Span<byte> pdu, PerPlcContext ctx)
{
    if (fc == 0x06)
    {
        ProcessFc06Request(pdu, ctx);
    }
    else if (fc == 0x10)
    {
        ProcessFc16Request(pdu, ctx);
    }

    // Anything else is forwarded byte-for-byte.
}
|
||||
|
||||
/// <summary>
/// FC06 Write Single Register request: [fc=06][addrHi][addrLo][valHi][valLo].
/// When the target address is a configured 16-bit BCD tag, the value bytes are replaced
/// in place with the BCD encoding of the client's binary integer.
/// </summary>
/// <remarks>
/// Cases that leave the PDU untouched:
/// <list type="bullet">
/// <item><description>PDU shorter than 5 bytes.</description></item>
/// <item><description>The address touches either register of a 32-bit tag: a partial-BCD
/// warning is logged and counted.</description></item>
/// <item><description><c>BcdCodec.Encode16</c> rejects the value: an invalid-BCD event is
/// logged and counted.</description></item>
/// <item><description>The address matches no tag at all.</description></item>
/// </list>
/// </remarks>
private static void ProcessFc06Request(Span<byte> pdu, PerPlcContext ctx)
{
    if (pdu.Length < 5)
    {
        return;
    }

    ushort address = (ushort)((pdu[1] << 8) | pdu[2]);
    ushort value = (ushort)((pdu[3] << 8) | pdu[4]);

    // Shared by both partial-overlap paths: warn, count, and leave the PDU as-is.
    void WarnPartialWrite()
    {
        RewriterLogEvents.PartialBcd(ctx.Logger, ctx.PlcName, address, address, 1);
        ctx.Counters.IncrementPartialBcd();
    }

    if (ctx.TagMap.TryGet(address, out var tag))
    {
        if (tag.IsThirtyTwoBit)
        {
            // FC06 carries exactly one register, so hitting the low address of a
            // 32-bit tag can only ever be half a write.
            WarnPartialWrite();
            return;
        }

        ushort bcd;
        try
        {
            bcd = BcdCodec.Encode16(value);
        }
        catch (ArgumentOutOfRangeException)
        {
            // The codec refused the value (not representable as 4-digit BCD); forward raw.
            RewriterLogEvents.InvalidBcd(ctx.Logger, ctx.PlcName, address, value, "Write");
            ctx.Counters.IncrementInvalidBcd();
            return;
        }

        pdu[3] = (byte)(bcd >> 8);
        pdu[4] = (byte)(bcd & 0xFF);
        ctx.Counters.AddRewrittenSlots(1);
        return;
    }

    // No tag starts at this address. It may still be the HIGH register of a 32-bit tag
    // that starts one address earlier; a one-register range query reports such a tag
    // with a negative word offset.
    if (!ctx.TagMap.TryGetForRange(address, 1, out var hits) || hits.Count <= 0)
    {
        return;
    }

    foreach (var hit in hits)
    {
        if (hit.Tag.IsThirtyTwoBit && hit.OffsetWords < 0)
        {
            WarnPartialWrite();
            return;
        }
    }
}
|
||||
|
||||
/// <summary>
/// FC16 Write Multiple Registers request:
///   [fc=10][startHi][startLo][qtyHi][qtyLo][byteCount][reg0Hi][reg0Lo]...
/// Re-encodes, in place, every register word that falls on a configured BCD tag from the
/// client's integer representation to BCD nibbles.
/// </summary>
/// <remarks>
/// <para>16-bit tags: the word is passed through <c>BcdCodec.Encode16</c>.</para>
/// <para>32-bit tags: the pair is combined as <c>high * 10_000 + low</c>, encoded with
/// <c>BcdCodec.Encode32</c>, and written back low word first. A pair that is only half
/// inside the written range is left raw and reported as partial BCD.</para>
/// <para>When an encode is rejected the slot is left raw and an invalid-BCD event is
/// logged. For 32-bit pairs the event names the register whose word exceeds 9999 (the
/// high register first), mirroring how the FC03/FC04 read path reports the offending
/// word, instead of always reporting the low register.</para>
/// <para>NOTE(review): the combine step is base-10000, not a 16-bit binary split, and a
/// low word above 9999 silently carries into the high digits as long as the total stays
/// encodable. Confirm against design.md whether such a word should be rejected.</para>
/// </remarks>
private static void ProcessFc16Request(Span<byte> pdu, PerPlcContext ctx)
{
    // fc(1) + start(2) + qty(2) + byteCount(1); register data starts right after.
    const int DataOffset = 6;

    // Largest per-register value in the two-word decimal representation the read path
    // hands to clients (decoded % 10_000 / decoded / 10_000).
    const int MaxWordValue = 9999;

    if (pdu.Length < DataOffset)
    {
        return;
    }

    ushort startAddress = (ushort)((pdu[1] << 8) | pdu[2]);
    ushort qty = (ushort)((pdu[3] << 8) | pdu[4]);

    // pdu[5] (byteCount) is not consulted: every slot access below is bounds-checked
    // against the actual PDU length instead.
    if (!ctx.TagMap.TryGetForRange(startAddress, qty, out var hits))
    {
        return; // no BCD tags in this range
    }

    foreach (var hit in hits)
    {
        int offsetWords = hit.OffsetWords;
        var tag = hit.Tag;

        if (tag.IsThirtyTwoBit)
        {
            // The pair is rewritable only when both of its registers are inside [0, qty).
            bool lowInRange = offsetWords >= 0 && offsetWords < qty;
            bool highInRange = (offsetWords + 1) >= 0 && (offsetWords + 1) < qty;

            if (!lowInRange || !highInRange)
            {
                // Partial overlap: one register of the pair is outside the write range.
                RewriterLogEvents.PartialBcd(ctx.Logger, ctx.PlcName,
                    tag.Address, startAddress, qty);
                ctx.Counters.IncrementPartialBcd();
                continue;
            }

            int lowByteOff = DataOffset + offsetWords * 2;
            int highByteOff = DataOffset + (offsetWords + 1) * 2;

            if (lowByteOff + 2 > pdu.Length || highByteOff + 2 > pdu.Length)
            {
                continue; // malformed PDU: declared qty exceeds the bytes present
            }

            // Low word sits at the tag address, high word at the next register.
            ushort clientLow = (ushort)((pdu[lowByteOff] << 8) | pdu[lowByteOff + 1]);
            ushort clientHigh = (ushort)((pdu[highByteOff] << 8) | pdu[highByteOff + 1]);

            int binaryValue = clientHigh * 10_000 + clientLow;

            ushort bcdLow, bcdHigh;
            try
            {
                (bcdLow, bcdHigh) = BcdCodec.Encode32(binaryValue);
            }
            catch (ArgumentOutOfRangeException)
            {
                // Report the word that actually pushed the value out of range. The high
                // word is checked first because it dominates the combined value.
                bool highIsOffender = clientHigh > MaxWordValue;
                ushort badAddr = highIsOffender ? tag.HighRegister : tag.Address;
                ushort badValue = highIsOffender ? clientHigh : clientLow;

                RewriterLogEvents.InvalidBcd(ctx.Logger, ctx.PlcName, badAddr,
                    badValue, "Write");
                ctx.Counters.IncrementInvalidBcd();
                continue;
            }

            pdu[lowByteOff] = (byte)(bcdLow >> 8);
            pdu[lowByteOff + 1] = (byte)(bcdLow & 0xFF);
            pdu[highByteOff] = (byte)(bcdHigh >> 8);
            pdu[highByteOff + 1] = (byte)(bcdHigh & 0xFF);
            ctx.Counters.AddRewrittenSlots(2);
        }
        else
        {
            // 16-bit tag. The range query should not return out-of-range hits here,
            // but stay defensive.
            if (offsetWords < 0 || offsetWords >= qty)
            {
                continue;
            }

            int byteOff = DataOffset + offsetWords * 2;
            if (byteOff + 2 > pdu.Length)
            {
                continue;
            }

            ushort clientValue = (ushort)((pdu[byteOff] << 8) | pdu[byteOff + 1]);

            ushort encoded;
            try
            {
                encoded = BcdCodec.Encode16(clientValue);
            }
            catch (ArgumentOutOfRangeException)
            {
                RewriterLogEvents.InvalidBcd(ctx.Logger, ctx.PlcName, tag.Address,
                    clientValue, "Write");
                ctx.Counters.IncrementInvalidBcd();
                continue;
            }

            pdu[byteOff] = (byte)(encoded >> 8);
            pdu[byteOff + 1] = (byte)(encoded & 0xFF);
            ctx.Counters.AddRewrittenSlots(1);
        }
    }
}
|
||||
|
||||
// ── Response processing (FC03 / FC04) ───────────────────────────────────
|
||||
|
||||
/// <summary>
/// Response-side rewriter. Exception responses are logged, counted and forwarded raw;
/// FC06 echoes go to <c>ProcessFc06Response</c>; FC03/FC04 payloads have their configured
/// BCD slots decoded in place. All other function codes are forwarded untouched.
/// </summary>
/// <remarks>
/// An FC03/FC04 response PDU is [fc][byteCount][reg0Hi][reg0Lo]... and does not carry the
/// start address, so decoding relies on <c>ctx.CurrentRequest</c>. When that is null, or
/// was not itself an FC03/FC04 request, the payload is forwarded raw.
/// </remarks>
private static void ProcessResponse(byte fc, Span<byte> pdu, PerPlcContext ctx)
{
    // Offset of the first register word in an FC03/FC04 response.
    const int RegisterData = 2;

    static ushort ReadWord(Span<byte> buf, int at)
        => (ushort)((buf[at] << 8) | buf[at + 1]);

    static void WriteWord(Span<byte> buf, int at, int word)
    {
        buf[at] = (byte)(word >> 8);
        buf[at + 1] = (byte)(word & 0xFF);
    }

    // High bit set on the function code marks a Modbus exception: [fc|0x80][code].
    bool isException = (fc & 0x80) != 0;
    if (isException)
    {
        byte requestFc = (byte)(fc & 0x7F);
        byte code = pdu.Length >= 2 ? pdu[1] : (byte)0;

        RewriterLogEvents.ExceptionPassthrough(ctx.Logger, ctx.PlcName, requestFc, code);
        ctx.Counters.IncrementBackendException(code);
        return;
    }

    if (fc == 0x06)
    {
        // The echo carries whatever the request path sent to the PLC; undo the encoding.
        ProcessFc06Response(pdu, ctx);
        return;
    }

    if (fc != 0x03 && fc != 0x04)
    {
        // Includes FC16, whose response has no register data to decode.
        return;
    }

    // Null covers callers that invoke the pipeline without correlation.
    if (ctx.CurrentRequest is not { Fc: 0x03 or 0x04 } request)
    {
        return;
    }

    ushort startAddress = request.StartAddress;
    ushort requestedQty = request.Qty;

    if (pdu.Length < 2)
    {
        return;
    }

    // Trust whichever is smaller: what was asked for, or what the byte count says came back.
    int wordsInResponse = pdu[1] / 2;
    ushort effectiveQty = (ushort)Math.Min(requestedQty, wordsInResponse);

    if (!ctx.TagMap.TryGetForRange(startAddress, effectiveQty, out var hits))
    {
        return;
    }

    foreach (var hit in hits)
    {
        var tag = hit.Tag;
        int firstWord = hit.OffsetWords;

        if (!tag.IsThirtyTwoBit)
        {
            if (firstWord < 0 || firstWord >= effectiveQty)
            {
                continue;
            }

            int at = RegisterData + firstWord * 2;
            if (at + 2 > pdu.Length)
            {
                continue;
            }

            ushort raw = ReadWord(pdu, at);

            int value;
            try
            {
                value = BcdCodec.Decode16(raw);
            }
            catch (FormatException)
            {
                RewriterLogEvents.InvalidBcd(ctx.Logger, ctx.PlcName, tag.Address, raw, "Read");
                ctx.Counters.IncrementInvalidBcd();
                continue;
            }

            WriteWord(pdu, at, value);
            ctx.Counters.AddRewrittenSlots(1);
            continue;
        }

        // 32-bit tag: both registers of the pair must be present in the response.
        int secondWord = firstWord + 1;
        bool lowInRange = firstWord >= 0 && firstWord < effectiveQty;
        bool highInRange = secondWord >= 0 && secondWord < effectiveQty;

        if (!(lowInRange && highInRange))
        {
            RewriterLogEvents.PartialBcd(ctx.Logger, ctx.PlcName,
                tag.Address, startAddress, requestedQty);
            ctx.Counters.IncrementPartialBcd();
            continue;
        }

        int lowAt = RegisterData + firstWord * 2;
        int highAt = RegisterData + secondWord * 2;

        if (lowAt + 2 > pdu.Length || highAt + 2 > pdu.Length)
        {
            continue;
        }

        // Low register (tag address) holds the low four digits, the next one the high four.
        ushort rawLow = ReadWord(pdu, lowAt);
        ushort rawHigh = ReadWord(pdu, highAt);

        int combined;
        try
        {
            combined = BcdCodec.Decode32(rawLow, rawHigh);
        }
        catch (FormatException)
        {
            // Blame the low word when it has a bad nibble, otherwise the high word.
            bool lowIsBad = HasBadNibble(rawLow);
            ushort badRaw = lowIsBad ? rawLow : rawHigh;
            ushort badAddr = lowIsBad ? tag.Address : tag.HighRegister;

            RewriterLogEvents.InvalidBcd(ctx.Logger, ctx.PlcName, badAddr, badRaw, "Read");
            ctx.Counters.IncrementInvalidBcd();
            continue;
        }

        // Hand the value back as two base-10000 words in the same register order.
        WriteWord(pdu, lowAt, combined % 10_000);
        WriteWord(pdu, highAt, combined / 10_000);
        ctx.Counters.AddRewrittenSlots(2);
    }
}
|
||||
|
||||
/// <summary>
/// FC06 response: [fc=06][addrHi][addrLo][valHi][valLo], an echo of the address and the
/// value that reached the PLC. For a configured 16-bit BCD tag the echoed value is
/// decoded back to a binary integer in place, so the client sees the value it sent.
/// </summary>
/// <remarks>
/// <para>Left untouched: PDUs shorter than 5 bytes, addresses with no tag, 32-bit tags
/// (the request path already warned about the partial write), and echoes that
/// <c>BcdCodec.Decode16</c> rejects (logged and counted as invalid BCD).</para>
/// <para>The rewritten-slot counter is deliberately not bumped here; the request path
/// already counted this slot.</para>
/// <para>NOTE(review): this method cannot tell whether the request was actually
/// re-encoded. A request forwarded raw because its value was rejected by the encoder can
/// produce an echo whose nibbles happen to be valid BCD, which would then be decoded.
/// Confirm whether the correlation record should carry a "rewritten" flag.</para>
/// </remarks>
private static void ProcessFc06Response(Span<byte> pdu, PerPlcContext ctx)
{
    if (pdu.Length < 5)
    {
        return;
    }

    ushort address = (ushort)((pdu[1] << 8) | pdu[2]);
    ushort echoed = (ushort)((pdu[3] << 8) | pdu[4]);

    // Only a tag configured exactly at this address, and only a 16-bit one, is decoded.
    if (!ctx.TagMap.TryGet(address, out var tag) || tag.IsThirtyTwoBit)
    {
        return;
    }

    int binary;
    try
    {
        binary = BcdCodec.Decode16(echoed);
    }
    catch (FormatException)
    {
        RewriterLogEvents.InvalidBcd(ctx.Logger, ctx.PlcName, address, echoed, "Read");
        ctx.Counters.IncrementInvalidBcd();
        return;
    }

    pdu[3] = (byte)(binary >> 8);
    pdu[4] = (byte)(binary & 0xFF);
}
|
||||
|
||||
// ── Helpers ──────────────────────────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Reports whether <paramref name="raw"/> contains a hex digit that is not a decimal
/// digit, i.e. any of its four nibbles is 0xA or above.
/// </summary>
private static bool HasBadNibble(ushort raw)
{
    // Walk the four nibbles from least to most significant.
    for (int shift = 0; shift < 16; shift += 4)
    {
        int nibble = (raw >> shift) & 0xF;
        if (nibble > 9)
        {
            return true;
        }
    }

    return false;
}
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
namespace Mbproxy.Proxy;
|
||||
|
||||
/// <summary>
/// Which way a Modbus PDU is travelling when it reaches the pipeline.
/// </summary>
public enum MbapDirection
{
    /// <summary>Request frame: upstream client towards the backend PLC.</summary>
    RequestToBackend = 0,

    /// <summary>Response frame: backend PLC back towards the upstream client.</summary>
    ResponseToClient = 1,
}
|
||||
|
||||
/// <summary>
/// Base context handed to every PDU pipeline call. On its own it carries only
/// <see cref="PlcName"/>; <see cref="PerPlcContext"/> derives from it to add the BCD tag
/// map, counters, logger and (since Phase 09) the per-call <c>CurrentRequest</c> slot used
/// for multiplexer-aware response correlation.
/// </summary>
public class PduContext
{
    /// <summary>
    /// The configured PLC name (from <c>MbproxyOptions.Plcs[i].Name</c>). Defaults to the
    /// empty string and can only be set during object initialisation.
    /// </summary>
    public string PlcName { get; init; } = string.Empty;
}
|
||||
|
||||
/// <summary>
/// Hook for inspecting and rewriting Modbus PDU bytes inline. The proxy calls it once per
/// frame in each direction (request and response).
///
/// Implementations must tolerate concurrent calls. <see cref="NoopPduPipeline"/> is the
/// pass-through implementation introduced in Phase 03; from Phase 04 on a BCD rewriter is
/// registered through DI in its place.
/// </summary>
public interface IPduPipeline
{
    /// <summary>
    /// Processes one Modbus PDU; the implementation may rewrite <paramref name="pdu"/> in place.
    /// </summary>
    /// <param name="direction">Whether the frame is a request or a response.</param>
    /// <param name="mbapHeader">
    /// The 7-byte MBAP header, read-only. Carries TxId and UnitId; the function code is
    /// <c>pdu[0]</c>, not part of the header.
    /// </param>
    /// <param name="pdu">PDU bytes beginning at the function code.</param>
    /// <param name="context">
    /// Call context. The base type carries the PLC name; derived types add more.
    /// </param>
    void Process(
        MbapDirection direction,
        ReadOnlySpan<byte> mbapHeader,
        Span<byte> pdu,
        PduContext context);
}
|
||||
@@ -0,0 +1,60 @@
|
||||
namespace Mbproxy.Proxy;
|
||||
|
||||
/// <summary>
/// Stateless helpers for reading Modbus Application Protocol (MBAP) headers.
///
/// Frame layout (7-byte header followed by the PDU):
///   [0..1]  TxId        big-endian uint16
///   [2..3]  ProtocolId  big-endian uint16 (0 for standard Modbus)
///   [4..5]  Length      big-endian uint16, counts UnitId + PDU bytes
///   [6]     UnitId
///   [7..]   PDU         function code + data, (Length - 1) bytes
///
/// Hence: total frame bytes = 6 + Length = 7 + (Length - 1).
/// </summary>
internal static class MbapFrame
{
    /// <summary>Bytes in the MBAP header: TxId (2) + ProtocolId (2) + Length (2) + UnitId (1).</summary>
    public const int HeaderSize = 7;

    /// <summary>Largest PDU body carried in one frame (Modbus spec maximum: 253 bytes).</summary>
    public const int MaxPduBodySize = 253;

    /// <summary>Buffer size for one whole frame: 7-byte header + 253-byte PDU body = 260.</summary>
    public const int BufferSize = HeaderSize + MaxPduBodySize;

    /// <summary>
    /// Reads the four header fields from the first 7 bytes of <paramref name="buffer"/>.
    /// </summary>
    /// <returns>
    /// <c>true</c> when at least 7 bytes were available; otherwise <c>false</c> with every
    /// out value set to zero.
    /// </returns>
    /// <remarks>
    /// No validation is applied to <paramref name="protocolId"/> or
    /// <paramref name="length"/>; the caller decides what to do with odd values.
    /// </remarks>
    public static bool TryParseHeader(
        ReadOnlySpan<byte> buffer,
        out ushort txId,
        out ushort protocolId,
        out ushort length,
        out byte unitId)
    {
        // Start from the "too short" result and overwrite only when the header is complete.
        txId = 0;
        protocolId = 0;
        length = 0;
        unitId = 0;

        if (buffer.Length < HeaderSize)
        {
            return false;
        }

        txId = BigEndianWord(buffer[0], buffer[1]);
        protocolId = BigEndianWord(buffer[2], buffer[3]);
        length = BigEndianWord(buffer[4], buffer[5]);
        unitId = buffer[6];
        return true;
    }

    /// <summary>
    /// Total frame size in bytes for a given MBAP length field: the 6 bytes that precede
    /// UnitId (TxId + ProtocolId + Length) plus everything the length field counts.
    /// </summary>
    public static int TotalFrameLength(ushort lengthField)
    {
        const int BytesBeforeUnitId = 6;
        return BytesBeforeUnitId + lengthField;
    }

    /// <summary>Combines a high and a low byte into one big-endian 16-bit value.</summary>
    private static ushort BigEndianWord(byte high, byte low)
        => (ushort)((high << 8) | low);
}
|
||||
@@ -0,0 +1,82 @@
|
||||
using System.Collections.Concurrent;
|
||||
|
||||
namespace Mbproxy.Proxy.Multiplexing;
|
||||
|
||||
/// <summary>
/// Lookup from a proxy-assigned MBAP TxId to the <see cref="InFlightRequest"/> it stands
/// for. Entries are added on the multiplexer's per-upstream <c>OnFrame</c> path and removed
/// by the backend reader task when the matching response arrives.
///
/// <para>Storage is a <see cref="ConcurrentDictionary{TKey, TValue}"/>. Phase 9's
/// single-writer / single-remover pattern does not strictly need it, but the
/// cascade-on-disconnect walk runs on a separate task and Phase 10 adds upstream-side
/// cancellation, so the concurrent primitive is used regardless.</para>
/// </summary>
internal sealed class CorrelationMap
{
    private readonly ConcurrentDictionary<ushort, InFlightRequest> _entries = new();

    /// <summary>Number of requests currently in flight.</summary>
    public int Count
    {
        get { return _entries.Count; }
    }

    /// <summary>
    /// Stores <paramref name="req"/> under <paramref name="proxyTxId"/>.
    /// </summary>
    /// <returns>
    /// <c>false</c> when the key is already occupied. That indicates a programming error
    /// (the allocator handed out a TxId that is still in flight); callers should treat it
    /// as a fatal contract violation and drop the upstream connection.
    /// </returns>
    public bool TryAdd(ushort proxyTxId, InFlightRequest req)
    {
        return _entries.TryAdd(proxyTxId, req);
    }

    /// <summary>
    /// Takes the entry for <paramref name="proxyTxId"/> out of the map.
    /// </summary>
    /// <returns>
    /// <c>false</c> when nothing was stored under the key, which is expected during
    /// cascade cleanup and for stale responses. <paramref name="req"/> is only meaningful
    /// when the result is <c>true</c>.
    /// </returns>
    public bool TryRemove(ushort proxyTxId, out InFlightRequest req)
    {
        return _entries.TryRemove(proxyTxId, out req!);
    }

    /// <summary>
    /// Copies the in-flight requests into a new array that is detached from the live
    /// dictionary. Allocates; meant for diagnostics such as the cascade walk on backend
    /// disconnect.
    /// </summary>
    public IReadOnlyCollection<InFlightRequest> Snapshot()
    {
        InFlightRequest[] copy = _entries.Values.ToArray();
        return copy;
    }

    /// <summary>
    /// Removes every entry it encounters and returns the removed (TxId, request) pairs.
    /// Used on the cascade path when the backend socket dies, so the multiplexer can close
    /// each interested upstream pipe and release each proxy TxId.
    /// </summary>
    public IReadOnlyList<KeyValuePair<ushort, InFlightRequest>> DrainAll()
    {
        var drained = new List<KeyValuePair<ushort, InFlightRequest>>(_entries.Count);

        foreach (var entry in _entries)
        {
            ushort txId = entry.Key;

            // Another task may have removed the entry since the enumerator saw it;
            // only report what this call actually took out.
            if (!_entries.TryRemove(txId, out var removed))
            {
                continue;
            }

            drained.Add(new KeyValuePair<ushort, InFlightRequest>(txId, removed));
        }

        return drained;
    }

    /// <summary>
    /// Lists the (TxId, request) pairs whose <see cref="InFlightRequest.SentAtUtc"/> is at
    /// or before <paramref name="threshold"/>. Nothing is removed; the caller decides which
    /// of them to time out. Allocates a list; intended for the periodic per-request
    /// timeout watchdog.
    /// </summary>
    public IReadOnlyList<KeyValuePair<ushort, InFlightRequest>> SnapshotOlderThan(DateTimeOffset threshold)
    {
        return _entries
            .Where(entry => entry.Value.SentAtUtc <= threshold)
            .ToList();
    }
}
|
||||
@@ -0,0 +1,41 @@
|
||||
namespace Mbproxy.Proxy.Multiplexing;
|
||||
|
||||
/// <summary>
/// A single upstream party waiting on one backend round-trip. The multiplexer rewrites the
/// response's MBAP TxId back to <see cref="OriginalTxId"/> before handing the frame to
/// <see cref="Pipe"/>, so the proxy stays transparent to each upstream.
///
/// <para><b>Phase 9 invariant:</b> each <see cref="InFlightRequest"/> has exactly one
/// <see cref="InterestedParty"/>. <b>Phase 10 (read coalescing)</b> reuses this same shape
/// to fan one backend response out to several upstream parties, so do not fold it into a
/// single field on <see cref="InFlightRequest"/>.</para>
/// </summary>
/// <param name="Pipe">Upstream pipe the response is delivered to.</param>
/// <param name="OriginalTxId">MBAP TxId the party originally sent.</param>
internal sealed record InterestedParty(
    UpstreamPipe Pipe,
    ushort OriginalTxId);
|
||||
|
||||
/// <summary>
/// Correlation record for one backend request, stored in <see cref="CorrelationMap"/> under
/// the proxy-assigned TxId. The backend reader task looks it up in order to:
/// <list type="bullet">
///   <item><description>put each interested party's original MBAP TxId back on the response
///   before forwarding it upstream (transparent multiplexing contract);</description></item>
///   <item><description>give the BCD rewriter the originating <c>StartAddress</c> /
///   <c>Qty</c> for FC03/FC04 decoding, since the response PDU does not carry the start
///   address;</description></item>
///   <item><description>measure backend round-trip time from <see cref="SentAtUtc"/>
///   (replacing the per-pair stopwatch slot of the 1:1 model).</description></item>
/// </list>
///
/// <para><b>Phase 9:</b> <see cref="InterestedParties"/> always holds exactly one element.
/// The list shape is the seam that <b>Phase 10 — read coalescing</b> builds on to fan one
/// PLC response out to several upstream clients without reshaping the multiplexer's data
/// model. Reviewer note: do <i>not</i> simplify it back to a single <c>UpstreamPipe</c>
/// field.</para>
/// </summary>
/// <param name="UnitId">Unit identifier of the request.</param>
/// <param name="Fc">Function code of the request.</param>
/// <param name="StartAddress">Start address of the request.</param>
/// <param name="Qty">Quantity of the request.</param>
/// <param name="InterestedParties">Upstream parties that receive the response.</param>
/// <param name="SentAtUtc">Timestamp used for round-trip timing and timeout checks.</param>
internal sealed record InFlightRequest(
    byte UnitId,
    byte Fc,
    ushort StartAddress,
    ushort Qty,
    IReadOnlyList<InterestedParty> InterestedParties,
    DateTimeOffset SentAtUtc);
|
||||
@@ -0,0 +1,121 @@
|
||||
namespace Mbproxy.Proxy.Multiplexing;
|
||||
|
||||
/// <summary>
|
||||
/// Source-generated <see cref="LoggerMessage"/> definitions for the TxId-multiplexing
|
||||
/// connection layer. Event names are stable — do not rename without updating
|
||||
/// docs/design.md's "Logging" event-name table.
|
||||
/// </summary>
|
||||
internal static partial class MultiplexerLogEvents
|
||||
{
|
||||
/// <summary>
/// Information-level event 110, written once per accepted upstream client. It keeps the
/// <c>mbproxy.client.connected</c> name and property shape of the per-pair event from the
/// 1:1 model, so operators' existing log queries continue to match.
/// </summary>
/// <param name="logger">Logger that receives the event.</param>
/// <param name="plc">Value for the <c>Plc</c> property.</param>
/// <param name="remoteEp">Value for the <c>RemoteEp</c> property.</param>
[LoggerMessage(
    EventId = 110,
    EventName = "mbproxy.client.connected",
    Level = LogLevel.Information,
    Message = "Client connected: Plc={Plc} RemoteEp={RemoteEp}")]
public static partial void ClientConnected(ILogger logger, string plc, string remoteEp);
|
||||
|
||||
/// <summary>
/// Information-level event 111, written when an upstream pipe is closed, whether by a
/// clean disconnect, a fault, or a cascade.
/// </summary>
/// <param name="logger">Logger that receives the event.</param>
/// <param name="plc">Value for the <c>Plc</c> property.</param>
/// <param name="remoteEp">Value for the <c>RemoteEp</c> property.</param>
/// <param name="reason">Value for the <c>Reason</c> property.</param>
[LoggerMessage(
    EventId = 111,
    EventName = "mbproxy.client.disconnected",
    Level = LogLevel.Information,
    Message = "Client disconnected: Plc={Plc} RemoteEp={RemoteEp} Reason={Reason}")]
public static partial void ClientDisconnected(ILogger logger, string plc, string remoteEp, string reason);
|
||||
|
||||
/// <summary>
/// Information-level event 112, written when the multiplexer has opened its single
/// backend connection to a PLC.
/// </summary>
/// <param name="logger">Logger that receives the event.</param>
/// <param name="plc">Value for the <c>Plc</c> property.</param>
/// <param name="host">Value for the <c>Host</c> property.</param>
/// <param name="port">Value for the <c>Port</c> property.</param>
[LoggerMessage(
    EventId = 112,
    EventName = "mbproxy.multiplex.backend.connected",
    Level = LogLevel.Information,
    Message = "Backend multiplex connection up: Plc={Plc} Host={Host} Port={Port}")]
public static partial void BackendConnected(ILogger logger, string plc, string host, int port);
|
||||
|
||||
/// <summary>
|
||||
/// Emitted when the multiplexer cascades a backend disconnect to all attached upstream
|
||||
/// clients. <c>UpstreamCount</c> is the number of upstream pipes that were closed and
|
||||
/// <c>InFlightCount</c> is the number of in-flight requests dropped.
|
||||
/// </summary>
|
||||
[LoggerMessage(
|
||||
EventId = 113,
|
||||
EventName = "mbproxy.multiplex.backend.disconnected",
|
||||
Level = LogLevel.Warning,
|
||||
Message = "Backend multiplex connection down: Plc={Plc} UpstreamCount={UpstreamCount} InFlightCount={InFlightCount} Reason={Reason}")]
|
||||
public static partial void BackendDisconnected(
|
||||
ILogger logger,
|
||||
string plc,
|
||||
int upstreamCount,
|
||||
int inFlightCount,
|
||||
string reason);
|
||||
|
||||
/// <summary>
|
||||
/// Emitted once when the TxId allocator refuses to allocate — every slot in the 16-bit
|
||||
/// space is currently in flight. The multiplexer responds to the upstream with a
|
||||
/// Modbus exception (code 04 / Slave Device Failure). Realistically unreachable under
|
||||
/// normal load (ECOM serializes at ~2-10 ms per request); a stress-only path.
|
||||
/// </summary>
|
||||
[LoggerMessage(
|
||||
EventId = 114,
|
||||
EventName = "mbproxy.multiplex.saturated",
|
||||
Level = LogLevel.Error,
|
||||
Message = "Multiplexer TxId space saturated — returning exception 04 to upstream: Plc={Plc} RemoteEp={RemoteEp}")]
|
||||
public static partial void Saturated(
|
||||
ILogger logger,
|
||||
string plc,
|
||||
string remoteEp);
|
||||
|
||||
/// <summary>
|
||||
/// Emitted when the backend connect Polly pipeline fails. Mirrors the existing
|
||||
/// <c>mbproxy.backend.failed</c> event from the 1:1 model so operators' alerts keep
|
||||
/// working unchanged after Phase 9.
|
||||
/// </summary>
|
||||
[LoggerMessage(
|
||||
EventId = 115,
|
||||
EventName = "mbproxy.backend.failed",
|
||||
Level = LogLevel.Warning,
|
||||
Message = "Backend connect failed: Plc={Plc} Reason={Reason}")]
|
||||
public static partial void BackendFailed(
|
||||
ILogger logger,
|
||||
string plc,
|
||||
string reason);
|
||||
|
||||
/// <summary>
|
||||
/// Emitted when the per-request watchdog times out an in-flight request whose response
|
||||
/// never arrived within <c>BackendRequestTimeoutMs</c>. The upstream party receives a
|
||||
/// Modbus exception (code 0x0B / Gateway Target Device Failed To Respond) and the
|
||||
/// proxy TxId is freed. Causes include: PLC dropped the response, network packet loss,
|
||||
/// or a backend that echoes the wrong MBAP TxId (e.g. pymodbus 3.13.0's
|
||||
/// concurrent-multiplexed-request bug).
|
||||
/// </summary>
|
||||
[LoggerMessage(
|
||||
EventId = 116,
|
||||
EventName = "mbproxy.multiplex.request.timeout",
|
||||
Level = LogLevel.Warning,
|
||||
Message = "In-flight request timed out: Plc={Plc} ProxyTxId={ProxyTxId} OriginalTxId={OriginalTxId} Fc={Fc} ElapsedMs={ElapsedMs}")]
|
||||
public static partial void RequestTimeout(
|
||||
ILogger logger,
|
||||
string plc,
|
||||
ushort proxyTxId,
|
||||
ushort originalTxId,
|
||||
byte fc,
|
||||
long elapsedMs);
|
||||
}
|
||||
@@ -0,0 +1,664 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Diagnostics;
|
||||
using System.Net.Sockets;
|
||||
using System.Threading.Channels;
|
||||
using Mbproxy.Options;
|
||||
using Polly;
|
||||
|
||||
namespace Mbproxy.Proxy.Multiplexing;
|
||||
|
||||
/// <summary>
|
||||
/// Owner of the single backend TCP connection to one PLC. Multiplexes many
|
||||
/// <see cref="UpstreamPipe"/> instances onto that one socket by rewriting MBAP transaction
|
||||
/// IDs so concurrent in-flight requests from different upstream clients remain
|
||||
/// distinguishable on the shared wire. The multiplexer:
|
||||
///
|
||||
/// <list type="bullet">
|
||||
/// <item><description>Opens and re-opens the backend socket through a Polly retry pipeline
|
||||
/// that matches the <see cref="ResilienceOptions.BackendConnect"/> profile.</description></item>
|
||||
/// <item><description>Runs one backend writer task that drains <see cref="_outboundChannel"/>
|
||||
/// into the backend socket (single writer; no socket-level synchronisation needed).</description></item>
|
||||
/// <item><description>Runs one backend reader task that decodes MBAP frames from the backend,
|
||||
/// looks each frame up in the <see cref="CorrelationMap"/>, restores each interested
|
||||
/// party's original TxId, and hands the frame to that party's
|
||||
/// <see cref="UpstreamPipe._responseChannel"/>.</description></item>
|
||||
/// <item><description>Cascades a backend disconnect by closing every attached pipe and
|
||||
/// freeing every allocated proxy TxId, then waits for the next upstream request to
|
||||
/// arrive (which triggers a fresh backend connect via Polly).</description></item>
|
||||
/// </list>
|
||||
///
|
||||
/// <para><b>Threading invariants:</b> a single backend writer touches the backend socket
|
||||
/// for sends; a single backend reader touches the same socket for receives. Per-upstream
|
||||
/// read tasks call <see cref="OnUpstreamFrameAsync"/>, which allocates a proxy TxId, queues
|
||||
/// the request frame into <see cref="_outboundChannel"/>, and returns. Upstream-side writes
|
||||
/// flow through each pipe's response channel — never directly through this class.</para>
|
||||
///
|
||||
/// <para><b>Lifecycle:</b> the multiplexer is created with the backend offline. The first
|
||||
/// <see cref="OnUpstreamFrameAsync"/> call (or the first <see cref="Attach"/> if you prefer
|
||||
/// eager-start) triggers backend connect through the Polly pipeline. Subsequent in-flight
|
||||
/// requests reuse the same socket. <see cref="DisposeAsync"/> tears down the backend
|
||||
/// socket, the writer/reader tasks, and every attached pipe.</para>
|
||||
/// </summary>
|
||||
internal sealed class PlcMultiplexer : IAsyncDisposable, IMultiplexCountersProvider
|
||||
{
|
||||
private const int OutboundChannelCapacity = 256;
|
||||
|
||||
private readonly PlcOptions _plc;
|
||||
private readonly ConnectionOptions _connectionOptions;
|
||||
private readonly IPduPipeline _pipeline;
|
||||
private readonly PerPlcContext _ctx;
|
||||
private readonly ILogger<PlcMultiplexer> _logger;
|
||||
private readonly ResiliencePipeline? _backendConnectPipeline;
|
||||
|
||||
private readonly TxIdAllocator _allocator = new();
|
||||
private readonly CorrelationMap _correlation = new();
|
||||
|
||||
private readonly Channel<byte[]> _outboundChannel = Channel.CreateBounded<byte[]>(
|
||||
new BoundedChannelOptions(OutboundChannelCapacity)
|
||||
{
|
||||
FullMode = BoundedChannelFullMode.Wait,
|
||||
SingleReader = true,
|
||||
SingleWriter = false,
|
||||
});
|
||||
|
||||
// Attached pipes — Phase 9 needs the list for the status page; Phase 10 will need it for
|
||||
// coalescing (fan-out). ConcurrentDictionary keyed on UpstreamPipe.Id for O(1) detach.
|
||||
private readonly ConcurrentDictionary<Guid, UpstreamPipe> _pipes = new();
|
||||
|
||||
// Lifecycle plumbing. Backend tasks share a CTS; cascading disconnect cancels it,
|
||||
// which terminates both the writer and reader tasks. The next call to
|
||||
// EnsureBackendConnectedAsync constructs a fresh CTS and a fresh backend socket.
|
||||
private readonly object _backendLock = new();
|
||||
private Socket? _backendSocket;
|
||||
private CancellationTokenSource? _backendCts;
|
||||
private Task? _backendWriterTask;
|
||||
private Task? _backendReaderTask;
|
||||
|
||||
private readonly CancellationTokenSource _disposeCts = new();
|
||||
private bool _disposed;
|
||||
private Task? _watchdogTask;
|
||||
|
||||
/// <summary>
/// Creates a multiplexer for one PLC. The backend stays offline until it is first needed;
/// construction only stores the collaborators, registers this instance as the PLC's
/// live counters source, and starts the request-timeout watchdog.
/// </summary>
public PlcMultiplexer(
    PlcOptions plc,
    ConnectionOptions connectionOptions,
    IPduPipeline pipeline,
    PerPlcContext perPlcContext,
    ILogger<PlcMultiplexer> logger,
    ResiliencePipeline? backendConnectPipeline = null)
{
    (_plc, _connectionOptions, _pipeline) = (plc, connectionOptions, pipeline);
    (_ctx, _logger, _backendConnectPipeline) = (perPlcContext, logger, backendConnectPipeline);

    // From here on the PLC's counters read their multiplex telemetry from this instance.
    _ctx.Counters.SetMultiplexProvider(this);

    // The watchdog periodically sweeps the correlation map and expires requests older
    // than BackendRequestTimeoutMs. Without it a lost or mis-echoed response (dead PLC
    // path, pymodbus 3.13.0's concurrent-request bug) would leave an entry stuck forever.
    _watchdogTask = Task.Run(
        () => RunRequestTimeoutWatchdogAsync(_disposeCts.Token),
        CancellationToken.None);
}
|
||||
|
||||
// ── IMultiplexCountersProvider ────────────────────────────────────────────
|
||||
|
||||
/// <summary>Proxy TxIds currently allocated, as reported by the allocator.</summary>
public long InFlightCount
{
    get { return _allocator.InFlightCount; }
}

/// <summary>Number of allocator cursor wrap-arounds, as reported by the allocator.</summary>
public long TxIdWraps
{
    get { return _allocator.WrapCount; }
}

/// <summary>Frames currently queued in the outbound channel awaiting the backend writer.</summary>
public long BackendQueueDepth
{
    get { return _outboundChannel.Reader.Count; }
}
|
||||
|
||||
// ── Public surface ────────────────────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Point-in-time copy of the upstream pipes attached right now (consumed by the status
/// page). Each read produces a new array, so callers never observe later attach/detach.
/// </summary>
public IReadOnlyCollection<UpstreamPipe> AttachedPipes
{
    get
    {
        UpstreamPipe[] snapshot = _pipes.Values.ToArray();
        return snapshot;
    }
}
|
||||
|
||||
/// <summary>
/// Registers <paramref name="pipe"/> with this multiplexer, keyed by its <c>Id</c>
/// (re-attaching the same Id overwrites the entry). Attaching does not start the pipe's
/// loops; <see cref="StartPipeAsync"/> does that and routes the pipe's frames into
/// <see cref="OnUpstreamFrameAsync"/>.
/// </summary>
/// <exception cref="ObjectDisposedException">The multiplexer has been disposed.</exception>
public void Attach(UpstreamPipe pipe)
{
    if (!_disposed)
    {
        _pipes[pipe.Id] = pipe;
        return;
    }

    throw new ObjectDisposedException(nameof(PlcMultiplexer));
}
|
||||
|
||||
/// <summary>
/// Attaches <paramref name="pipe"/>, launches its write loop in the background, and
/// starts its read loop with every decoded frame forwarded to
/// <see cref="OnUpstreamFrameAsync"/>.
/// </summary>
/// <returns>
/// The read-loop task. When it finishes (for any reason) the pipe is removed from the
/// attached set; disposing the pipe remains the caller's (or the cascade's) job.
/// </returns>
public Task StartPipeAsync(UpstreamPipe pipe, CancellationToken ct)
{
    Attach(pipe);

    // Fire-and-forget: the write loop ends when the pipe is disposed or its response
    // channel completes, and is joined by the pipe's own DisposeAsync.
    _ = Task.Run(() => pipe.RunWriteLoopAsync(ct), CancellationToken.None);

    var readLoopTask = pipe.RunReadLoopAsync(
        (frame, frameCt) => OnUpstreamFrameAsync(pipe, frame, frameCt),
        ct);

    // Detach only — ownership of disposal stays with the listener / cascade walker.
    _ = readLoopTask.ContinueWith(
        finishedLoop =>
        {
            _pipes.TryRemove(pipe.Id, out _);
        },
        TaskScheduler.Default);

    return readLoopTask;
}
|
||||
|
||||
/// <summary>
/// Shuts the multiplexer down. In order: unlinks the counters provider, cancels the
/// dispose token, waits (up to 2 s) for the watchdog, tears down the backend with an
/// upstream cascade, completes the outbound channel, and disposes any pipe that is
/// still attached. Calling it again after the first call is a no-op.
/// </summary>
public async ValueTask DisposeAsync()
{
    if (_disposed)
    {
        return;
    }

    _disposed = true;

    // Unlink first so a status snapshot taken mid-teardown doesn't read state that is
    // about to be emptied.
    _ctx.Counters.SetMultiplexProvider(null);

    await _disposeCts.CancelAsync().ConfigureAwait(false);

    // Give the watchdog a bounded chance to finish any log/dispatch it has in progress;
    // a timeout or fault here is deliberately ignored.
    if (_watchdogTask is { } watchdog)
    {
        try
        {
            await watchdog.WaitAsync(TimeSpan.FromSeconds(2)).ConfigureAwait(false);
        }
        catch
        {
            /* swallow */
        }
    }

    await TearDownBackendAsync("disposing", cascadeUpstreams: true).ConfigureAwait(false);
    _outboundChannel.Writer.TryComplete();

    // TearDownBackendAsync returns early when no backend was ever opened, so sweep
    // whatever pipes are still registered.
    foreach (UpstreamPipe remaining in _pipes.Values)
    {
        try
        {
            await remaining.DisposeAsync().ConfigureAwait(false);
        }
        catch
        {
            /* best effort */
        }
    }

    _pipes.Clear();
    _disposeCts.Dispose();
}
|
||||
|
||||
// ── Backend connect / teardown ────────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Ensures the single backend connection to the PLC is up, opening it when necessary.
/// Concurrent callers are serialised through <c>_connectGate</c>, so at most one
/// connect sequence runs at a time.
/// </summary>
/// <param name="ct">Cancels waiting for the gate and the connect itself.</param>
/// <returns>
/// <c>true</c> when a live backend connection exists on return; <c>false</c> when the
/// multiplexer is disposed or the connect (including all retry attempts) failed.
/// </returns>
private async Task<bool> EnsureBackendConnectedAsync(CancellationToken ct)
{
    if (_disposed) return false;

    // Fast path: already connected.
    if (_backendSocket is { Connected: true } && _backendCts is { IsCancellationRequested: false })
        return true;

    // Serialise concurrent connect attempts from many upstream pipes.
    await _connectGate.WaitAsync(ct).ConfigureAwait(false);
    try
    {
        // Re-check after acquiring the gate: another caller may have connected, or the
        // multiplexer may have been disposed while we waited.
        if (_disposed) return false;
        if (_backendSocket is { Connected: true } && _backendCts is { IsCancellationRequested: false })
            return true;

        Socket? backend = null;
        try
        {
            if (_backendConnectPipeline is not null)
            {
                // Every retry attempt gets its own socket: a socket whose connect failed
                // or was cancelled is not reliably reusable for another attempt.
                await _backendConnectPipeline.ExecuteAsync(async attemptToken =>
                {
                    backend = await OpenBackendSocketAsync(attemptToken).ConfigureAwait(false);
                }, ct).ConfigureAwait(false);
            }
            else
            {
                backend = await OpenBackendSocketAsync(ct).ConfigureAwait(false);
            }
        }
        catch (Exception ex)
        {
            string reason = ex is OperationCanceledException
                ? $"Backend connect timed out or cancelled after {_connectionOptions.BackendConnectTimeoutMs} ms"
                : ex.Message;
            MultiplexerLogEvents.BackendFailed(_logger, _plc.Name, reason);
            _ctx.Counters.IncrementConnectFailed();
            backend?.Dispose();
            return false;
        }

        if (backend is null)
        {
            // Defensive: the pipeline returned without running the connect callback.
            _ctx.Counters.IncrementConnectFailed();
            return false;
        }

        // Successful connect. If disposal raced with it, drop the socket instead of
        // wiring up tasks nobody will ever tear down.
        CancellationTokenSource backendCts;
        try
        {
            if (_disposed)
            {
                backend.Dispose();
                return false;
            }

            backendCts = CancellationTokenSource.CreateLinkedTokenSource(_disposeCts.Token);
        }
        catch (ObjectDisposedException)
        {
            // _disposeCts was already disposed by DisposeAsync.
            backend.Dispose();
            return false;
        }

        Socket live = backend;
        lock (_backendLock)
        {
            _backendSocket = live;
            _backendCts = backendCts;
            _backendWriterTask = Task.Run(() => RunBackendWriterAsync(live, backendCts.Token), CancellationToken.None);
            _backendReaderTask = Task.Run(() => RunBackendReaderAsync(live, backendCts.Token), CancellationToken.None);
        }

        _ctx.Counters.IncrementConnectSuccess();
        MultiplexerLogEvents.BackendConnected(_logger, _plc.Name, _plc.Host, _plc.Port);
        return true;
    }
    finally
    {
        _connectGate.Release();
    }
}

/// <summary>
/// Performs one connect attempt on a brand-new IPv4 TCP socket, bounded by
/// <c>BackendConnectTimeoutMs</c>. The socket is disposed if the attempt fails, so the
/// caller only ever receives a connected socket.
/// </summary>
private async Task<Socket> OpenBackendSocketAsync(CancellationToken ct)
{
    var socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp)
    { NoDelay = true };

    try
    {
        using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
        timeoutCts.CancelAfter(_connectionOptions.BackendConnectTimeoutMs);
        await socket.ConnectAsync(_plc.Host, _plc.Port, timeoutCts.Token).ConfigureAwait(false);
        return socket;
    }
    catch
    {
        socket.Dispose();
        throw;
    }
}
|
||||
|
||||
private readonly SemaphoreSlim _connectGate = new(1, 1);
|
||||
|
||||
/// <summary>
/// Tears down the current backend connection: detaches the backend state under
/// <c>_backendLock</c>, cancels and closes it, drains the correlation map (releasing
/// every proxy TxId), optionally disposes all attached upstream pipes, and finally
/// waits up to 2 s each for the writer and reader tasks.
/// </summary>
/// <param name="reason">Free-text cause, written to the disconnect log event.</param>
/// <param name="cascadeUpstreams">
/// When <c>true</c>, every attached upstream pipe is disposed and counted.
/// </param>
/// <remarks>
/// Returns immediately when no backend state is present, which also makes concurrent or
/// repeated calls harmless: only the first caller sees non-null state.
/// </remarks>
private async Task TearDownBackendAsync(string reason, bool cascadeUpstreams)
{
    Socket? oldSocket;
    CancellationTokenSource? oldCts;
    Task? writer, reader;
    lock (_backendLock)
    {
        oldSocket = _backendSocket;
        oldCts = _backendCts;
        writer = _backendWriterTask;
        reader = _backendReaderTask;

        _backendSocket = null;
        _backendCts = null;
        _backendWriterTask = null;
        _backendReaderTask = null;
    }

    if (oldSocket is null && oldCts is null) return;

    try { oldCts?.Cancel(); } catch { /* best effort */ }

    try { oldSocket?.Shutdown(SocketShutdown.Both); } catch { /* already closed */ }
    try { oldSocket?.Dispose(); } catch { /* best effort */ }

    // Every in-flight request is lost with the connection: drop the correlation entries
    // and hand their proxy TxIds back to the allocator.
    var dropped = _correlation.DrainAll();
    foreach (var kvp in dropped)
        _allocator.Release(kvp.Key);

    int upstreamCount = 0;
    if (cascadeUpstreams)
    {
        // Per the design doc, ALL attached upstreams cascade on backend disconnect —
        // not only the ones that had a request in flight. Work from a snapshot so the
        // reported count matches exactly the pipes disposed here.
        var pipeList = _pipes.Values.ToArray();
        upstreamCount = pipeList.Length;

        foreach (var pipe in pipeList)
        {
            try { await pipe.DisposeAsync().ConfigureAwait(false); }
            catch { /* best effort */ }
        }
        _pipes.Clear();

        _ctx.Counters.AddDisconnectCascades(upstreamCount);
    }

    // Best-effort join. This method may itself have been started by the writer or reader
    // task, hence the bounded wait rather than an unconditional await.
    try { if (writer is not null) await writer.WaitAsync(TimeSpan.FromSeconds(2)).ConfigureAwait(false); } catch { /* swallow */ }
    try { if (reader is not null) await reader.WaitAsync(TimeSpan.FromSeconds(2)).ConfigureAwait(false); } catch { /* swallow */ }

    oldCts?.Dispose();

    if (upstreamCount > 0 || dropped.Count > 0)
        MultiplexerLogEvents.BackendDisconnected(_logger, _plc.Name, upstreamCount, dropped.Count, reason);
}
|
||||
|
||||
// ── Backend writer / reader tasks ─────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Backend writer loop: takes frames off the outbound channel one at a time and pushes
/// each one fully onto the backend socket. Cancellation ends the loop quietly; any other
/// failure triggers a backend teardown with an upstream cascade.
/// </summary>
private async Task RunBackendWriterAsync(Socket backend, CancellationToken ct)
{
    try
    {
        await foreach (byte[] outbound in _outboundChannel.Reader.ReadAllAsync(ct).ConfigureAwait(false))
        {
            // A send may accept only part of the buffer; keep going until none is left.
            ReadOnlyMemory<byte> unsent = outbound.AsMemory(0, outbound.Length);
            while (!unsent.IsEmpty)
            {
                int written = await backend
                    .SendAsync(unsent, SocketFlags.None, ct)
                    .ConfigureAwait(false);

                if (written == 0)
                    throw new SocketException((int)SocketError.ConnectionReset);

                unsent = unsent.Slice(written);
            }
        }
    }
    catch (OperationCanceledException)
    {
        // Teardown in progress — nothing to report.
    }
    catch (Exception ex)
    {
        // The backend is unusable; tear it down and cascade (fire-and-forget).
        _ = TearDownBackendAsync($"writer fault: {ex.Message}", cascadeUpstreams: true);
    }
}
|
||||
|
||||
/// <summary>
/// Backend reader loop. For each MBAP frame read from the backend socket it claims the
/// matching correlation entry, releases the proxy TxId, records the round-trip time,
/// runs the response through the PDU pipeline, and forwards the frame to every
/// interested upstream party with that party's original TxId patched back in.
/// </summary>
/// <remarks>
/// Frames with no correlation entry are dropped. The loop ends on EOF, a header that
/// fails to parse, or an oversized frame; all of those — and any non-cancellation
/// exception — tear the backend down with an upstream cascade.
/// </remarks>
private async Task RunBackendReaderAsync(Socket backend, CancellationToken ct)
{
    byte[] headerBuf = new byte[MbapFrame.HeaderSize];
    try
    {
        while (!ct.IsCancellationRequested)
        {
            if (!await FillAsync(backend, headerBuf, 0, MbapFrame.HeaderSize, ct).ConfigureAwait(false))
                break;

            if (!MbapFrame.TryParseHeader(headerBuf.AsSpan(),
                    out ushort proxyTxId, out _, out ushort length, out _))
                break;

            if (length < 1)
            {
                // Degenerate frame (no body follows the header) — drop.
                continue;
            }

            // The MBAP length field counts the unit id, which is already part of the
            // header buffer; what remains on the wire is the PDU body.
            int pduBodyLen = length - 1;
            if (pduBodyLen > MbapFrame.MaxPduBodySize)
            {
                // Frame too large — backend is misbehaving; force teardown.
                _logger.LogWarning(
                    "Oversized backend frame: Plc={Plc} PduBody={Body} > Max={Max}",
                    _plc.Name, pduBodyLen, MbapFrame.MaxPduBodySize);
                break;
            }

            byte[] frame = new byte[MbapFrame.HeaderSize + pduBodyLen];
            Buffer.BlockCopy(headerBuf, 0, frame, 0, MbapFrame.HeaderSize);

            if (!await FillAsync(backend, frame, MbapFrame.HeaderSize, pduBodyLen, ct).ConfigureAwait(false))
                break;

            if (!_correlation.TryRemove(proxyTxId, out var inFlight))
            {
                // No correlation entry — a stale response after a cascade/timeout, or
                // something unsolicited from the PLC. Drop the frame.
                continue;
            }

            // Free the allocator slot immediately so it can be reused.
            _allocator.Release(proxyTxId);

            // Round-trip time. The send time is wall-clock, so the elapsed span is
            // converted to Stopwatch ticks, the unit the original code passes to
            // UpdateRoundTripEwma. Non-positive values (clock adjusted backwards) are skipped.
            TimeSpan roundTrip = DateTimeOffset.UtcNow - inFlight.SentAtUtc;
            long ticks = (long)(roundTrip.TotalSeconds * Stopwatch.Frequency);
            if (ticks > 0)
                _ctx.Counters.UpdateRoundTripEwma(ticks);

            // Apply the BCD rewriter on the response. The per-call context carries the
            // originating request so the rewriter can decode FC03/04 slots.
            var responseCtx = _ctx.WithCurrentRequest(inFlight);
            _pipeline.Process(
                MbapDirection.ResponseToClient,
                frame.AsSpan(0, MbapFrame.HeaderSize),
                frame.AsSpan(MbapFrame.HeaderSize, pduBodyLen),
                responseCtx);

            // Fan out to each interested party with their original TxId restored.
            // Phase 9: always exactly one party. Phase 10: N parties (read coalescing).
            foreach (var party in inFlight.InterestedParties)
            {
                if (!party.Pipe.IsAlive)
                    continue;

                // With a single party the buffer is handed over as-is; with several,
                // each party needs its own copy because the TxId bytes differ per party.
                byte[] outFrame = inFlight.InterestedParties.Count == 1
                    ? frame
                    : (byte[])frame.Clone();

                outFrame[0] = (byte)(party.OriginalTxId >> 8);
                outFrame[1] = (byte)(party.OriginalTxId & 0xFF);

                await party.Pipe.SendResponseAsync(outFrame, ct).ConfigureAwait(false);
            }
        }

        // The loop ended without an exception: remote EOF, an unparseable header, or an
        // oversized frame. In every case the connection is no longer usable — cascade.
        _ = TearDownBackendAsync("backend reader EOF", cascadeUpstreams: true);
    }
    catch (OperationCanceledException)
    {
        // Normal teardown.
    }
    catch (Exception ex)
    {
        _ = TearDownBackendAsync($"reader fault: {ex.Message}", cascadeUpstreams: true);
    }
}
|
||||
|
||||
// ── Upstream → multiplexer entry point ────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Entry point for one request frame read from an upstream pipe. Ensures the backend is
/// connected, allocates a proxy TxId, records the correlation entry, runs the request
/// through the PDU pipeline, rewrites the MBAP TxId, and queues the frame for the
/// backend writer.
/// </summary>
/// <param name="pipe">The upstream pipe the frame came from.</param>
/// <param name="frame">Complete MBAP frame; it is modified in place.</param>
/// <param name="ct">Cancels the connect wait and the enqueue.</param>
/// <remarks>
/// Outcomes other than a successful enqueue: the pipe is disposed when the backend
/// cannot be connected; frames too short to carry a function code, or whose header does
/// not parse, are dropped silently; TxId exhaustion is answered with Modbus exception 04.
/// </remarks>
private async ValueTask OnUpstreamFrameAsync(UpstreamPipe pipe, byte[] frame, CancellationToken ct)
{
    if (_disposed) return;

    // Ensure backend is connected. Failure here means we cannot service the request;
    // close the upstream pipe (consistent with the 1:1 model's behaviour on connect
    // failure).
    if (!await EnsureBackendConnectedAsync(ct).ConfigureAwait(false))
    {
        try { await pipe.DisposeAsync().ConfigureAwait(false); } catch { /* best effort */ }
        return;
    }

    // A usable request needs the MBAP header plus at least the function-code byte.
    // Rejecting header-only frames here keeps the frame[pduOffset] read below in range
    // and avoids allocating a proxy TxId that would then be leaked.
    if (frame.Length <= MbapFrame.HeaderSize)
        return;

    if (!MbapFrame.TryParseHeader(frame.AsSpan(0, MbapFrame.HeaderSize),
            out ushort originalTxId, out _, out _, out byte unitId))
        return;

    int pduOffset = MbapFrame.HeaderSize;
    byte fcByte = frame[pduOffset];

    if (!_allocator.TryAllocate(out ushort proxyTxId))
    {
        MultiplexerLogEvents.Saturated(_logger, _plc.Name, pipe.RemoteEp?.ToString() ?? "?");
        // Synthesize Modbus exception 04 (Slave Device Failure).
        byte[] excFrame = BuildExceptionFrame(originalTxId, unitId, fcByte, exceptionCode: 4);
        await pipe.SendResponseAsync(excFrame, ct).ConfigureAwait(false);
        return;
    }

    // Capture start/qty for FC03/04 so the response decoder has the correlation it
    // needs; other function codes leave both at zero.
    ushort startAddr = 0;
    ushort qty = 0;
    if (fcByte is 0x03 or 0x04 && frame.Length >= pduOffset + 5)
    {
        startAddr = (ushort)((frame[pduOffset + 1] << 8) | frame[pduOffset + 2]);
        qty = (ushort)((frame[pduOffset + 3] << 8) | frame[pduOffset + 4]);
    }

    var inFlight = new InFlightRequest(
        UnitId: unitId,
        Fc: fcByte,
        StartAddress: startAddr,
        Qty: qty,
        InterestedParties: [new InterestedParty(pipe, originalTxId)],
        SentAtUtc: DateTimeOffset.UtcNow);

    if (!_correlation.TryAdd(proxyTxId, inFlight))
    {
        // Should be impossible: the allocator just guaranteed proxyTxId is free.
        _allocator.Release(proxyTxId);
        _logger.LogError("CorrelationMap.TryAdd failed for already-free proxyTxId {ProxyTxId}", proxyTxId);
        return;
    }

    // Peak in-flight tracking.
    _ctx.Counters.ObserveInFlight(_allocator.InFlightCount);

    // Apply the BCD rewriter on the request. Use a per-call context with CurrentRequest
    // (the rewriter doesn't currently need it on request, but Phase 10 may).
    var requestCtx = _ctx.WithCurrentRequest(inFlight);
    _pipeline.Process(
        MbapDirection.RequestToBackend,
        frame.AsSpan(0, MbapFrame.HeaderSize),
        frame.AsSpan(MbapFrame.HeaderSize, frame.Length - MbapFrame.HeaderSize),
        requestCtx);

    // Overwrite the MBAP TxId with the proxy TxId.
    frame[0] = (byte)(proxyTxId >> 8);
    frame[1] = (byte)(proxyTxId & 0xFF);

    // Enqueue for the backend writer task.
    try
    {
        await _outboundChannel.Writer.WriteAsync(frame, ct).ConfigureAwait(false);
    }
    catch (Exception ex) when (ex is ChannelClosedException or OperationCanceledException)
    {
        // The frame never reached the queue (channel completed during shutdown, or the
        // enqueue was cancelled), so no response can arrive: undo the reservation now
        // instead of leaving it for the watchdog.
        if (_correlation.TryRemove(proxyTxId, out _))
            _allocator.Release(proxyTxId);

        // Cancellation still propagates to the caller, as before.
        if (ex is OperationCanceledException)
            throw;
    }
}
|
||||
|
||||
// ── Per-request timeout watchdog ──────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Background sweep that expires in-flight requests older than
/// <see cref="ConnectionOptions.BackendRequestTimeoutMs"/>. An expired entry is removed
/// from the correlation map, its proxy TxId is released, and each interested party is
/// sent Modbus exception 0x0B (Gateway Target Device Failed To Respond) carrying that
/// party's original TxId.
///
/// <para><b>Rationale.</b> With one backend socket per client, a lost response simply
/// faulted that socket and closed the pair. On a shared socket nothing fails when one
/// response goes missing or is mis-routed, so without an explicit per-request timer the
/// correlation entry would leak and the upstream client would wait forever. Typical
/// triggers: a response the PLC never sent, packet loss, a backend echoing the wrong
/// MBAP TxId.</para>
/// </summary>
private async Task RunRequestTimeoutWatchdogAsync(CancellationToken ct)
{
    // Sweep at a quarter of the request timeout for prompt cleanup, but never more often
    // than every 100 ms so tiny timeouts don't turn this into a busy loop.
    int tickMs = Math.Max(100, _connectionOptions.BackendRequestTimeoutMs / 4);

    try
    {
        while (!ct.IsCancellationRequested)
        {
            await Task.Delay(tickMs, ct).ConfigureAwait(false);

            DateTimeOffset cutoff =
                DateTimeOffset.UtcNow.AddMilliseconds(-_connectionOptions.BackendRequestTimeoutMs);

            foreach (var candidate in _correlation.SnapshotOlderThan(cutoff))
            {
                ushort proxyTxId = candidate.Key;

                // Claim the entry. Losing the race means a response or a cascade got
                // there first and there is nothing left to expire.
                if (_correlation.TryRemove(proxyTxId, out var req))
                {
                    _allocator.Release(proxyTxId);

                    long elapsedMs = (long)(DateTimeOffset.UtcNow - req.SentAtUtc).TotalMilliseconds;

                    foreach (var party in req.InterestedParties)
                    {
                        MultiplexerLogEvents.RequestTimeout(
                            _logger, _plc.Name, proxyTxId, party.OriginalTxId, req.Fc, elapsedMs);

                        if (party.Pipe.IsAlive)
                        {
                            // Answer with exception 0x0B so the client library raises a
                            // clean Modbus error instead of sitting on its own timeout.
                            byte[] excFrame = BuildExceptionFrame(
                                party.OriginalTxId, req.UnitId, req.Fc, exceptionCode: 0x0B);
                            try
                            {
                                await party.Pipe.SendResponseAsync(excFrame, ct).ConfigureAwait(false);
                            }
                            catch
                            {
                                // Delivery is best-effort: a pipe that is going down lets
                                // the client find out through its own socket close.
                            }
                        }
                    }
                }
            }
        }
    }
    catch (OperationCanceledException)
    {
        // Multiplexer is shutting down.
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Request-timeout watchdog faulted: Plc={Plc}", _plc.Name);
    }
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Reads from <paramref name="socket"/> until exactly <paramref name="count"/> bytes
/// have been stored in <paramref name="buf"/> starting at <paramref name="offset"/>.
/// </summary>
/// <returns>
/// <c>true</c> once all bytes are read (immediately when <paramref name="count"/> is not
/// positive); <c>false</c> if a receive returns 0 bytes before the range is full.
/// </returns>
private static async Task<bool> FillAsync(
    Socket socket, byte[] buf, int offset, int count, CancellationToken ct)
{
    for (int filled = 0; filled < count;)
    {
        Memory<byte> window = buf.AsMemory(offset + filled, count - filled);
        int received = await socket.ReceiveAsync(window, SocketFlags.None, ct).ConfigureAwait(false);

        if (received == 0)
            return false;

        filled += received;
    }

    return true;
}
|
||||
|
||||
/// <summary>
/// Builds a complete MBAP frame carrying a Modbus exception response:
/// TxId (big-endian), protocol id 0, length 3, unit id, <c>fc | 0x80</c>, exception code.
/// </summary>
private static byte[] BuildExceptionFrame(ushort originalTxId, byte unitId, byte fc, byte exceptionCode)
{
    // Header plus the two-byte exception PDU. A new array is zero-filled, which already
    // gives the protocol id (bytes 2-3) and the length high byte (byte 4).
    byte[] reply = new byte[MbapFrame.HeaderSize + 2];

    reply[0] = (byte)(originalTxId >> 8);
    reply[1] = (byte)(originalTxId & 0xFF);
    reply[5] = 3;                       // length: unit id (1) + exception FC (1) + code (1)
    reply[6] = unitId;
    reply[7] = (byte)(fc | 0x80);       // high bit marks the PDU as an exception
    reply[8] = exceptionCode;

    return reply;
}
|
||||
}
|
||||
@@ -0,0 +1,142 @@
|
||||
namespace Mbproxy.Proxy.Multiplexing;
|
||||
|
||||
/// <summary>
/// Allocates 16-bit MBAP transaction IDs (proxy TxIds) used to multiplex many upstream
/// clients onto a single shared backend connection per PLC. The allocator records which
/// IDs are in flight and scans forward from a rolling cursor for the next free slot, so
/// a released ID is not handed out again until the cursor comes back around to it.
///
/// <para><b>Thread-safety:</b> all state is guarded by one private lock. The only
/// lock-free read is <see cref="WrapCount"/>, which uses <see cref="Interlocked.Read"/>.</para>
///
/// <para><b>Memory:</b> one <c>bool[65536]</c> (~64 KB) per allocator instance.</para>
///
/// <para><b>Wrap counter:</b> incremented once for every successful allocation during
/// which the rolling cursor passes from 0xFFFF to 0x0000 — whether because slot 0xFFFF
/// itself was handed out, or because the scan had to skip in-use slots at the top of the
/// range before finding a free one. It is a telemetry signal, not an error.</para>
/// </summary>
internal sealed class TxIdAllocator
{
    // 65,536 slots total — the full uint16 space.
    private const int SlotCount = 65536;

    private readonly object _lock = new();
    private readonly bool[] _inUse = new bool[SlotCount];
    private ushort _next;          // rolling cursor; 0 on construction
    private int _inFlightCount;    // 0..65536
    private long _wrapCount;       // monotonic; never resets

    /// <summary>
    /// Number of proxy TxIds currently allocated and not yet released. Read under the
    /// same lock that mutates it so the value is consistent with the slot table.
    /// </summary>
    public int InFlightCount
    {
        get
        {
            lock (_lock)
            {
                return _inFlightCount;
            }
        }
    }

    /// <summary>
    /// Number of successful allocations during which the rolling cursor wrapped
    /// 0xFFFF → 0x0000 since construction. Read without taking the lock.
    /// </summary>
    public long WrapCount => Interlocked.Read(ref _wrapCount);

    /// <summary>
    /// Attempts to allocate the next free proxy TxId.
    /// </summary>
    /// <param name="id">The allocated ID on success; <c>0</c> on failure.</param>
    /// <returns>
    /// <c>true</c> when an ID was allocated; <c>false</c> when every slot in the 16-bit
    /// space is in use. Handling saturation (logging, answering the upstream with a
    /// Modbus exception) is the caller's responsibility.
    /// </returns>
    public bool TryAllocate(out ushort id)
    {
        lock (_lock)
        {
            if (_inFlightCount >= SlotCount)
            {
                id = 0;
                return false;
            }

            // At least one slot is free, so the scan below finds it within SlotCount steps.
            ushort start = _next;
            ushort cursor = start;
            do
            {
                ushort following = unchecked((ushort)(cursor + 1));

                if (!_inUse[cursor])
                {
                    _inUse[cursor] = true;
                    _inFlightCount++;

                    // The rolling cursor wrapped during this allocation when either
                    //  - the slot handed out is 0xFFFF (the cursor advances to 0), or
                    //  - the scan already rolled over while skipping in-use slots, which
                    //    shows up as the allocated slot being below the starting point.
                    // The two cases are mutually exclusive, so a wrap is counted once.
                    if (following == 0 || cursor < start)
                    {
                        Interlocked.Increment(ref _wrapCount);
                    }

                    _next = following;
                    id = cursor;
                    return true;
                }

                cursor = following;
            }
            while (cursor != start);

            // Defensive: unreachable while _inFlightCount agrees with _inUse.
            id = 0;
            return false;
        }
    }

    /// <summary>
    /// Releases a previously allocated proxy TxId. Releasing an ID that is not currently
    /// allocated is a no-op, so competing cleanup paths may both call this safely.
    /// </summary>
    /// <param name="id">The proxy TxId to release.</param>
    public void Release(ushort id)
    {
        lock (_lock)
        {
            if (!_inUse[id])
            {
                return;
            }

            _inUse[id] = false;
            _inFlightCount--;
        }
    }

    /// <summary>
    /// Test-only: reports whether the given proxy TxId is currently marked in use.
    /// Internal so unit tests can reach it via InternalsVisibleTo.
    /// </summary>
    internal bool IsAllocated(ushort id)
    {
        lock (_lock)
        {
            return _inUse[id];
        }
    }
}
|
||||
@@ -0,0 +1,281 @@
|
||||
using System.Net;
|
||||
using System.Net.Sockets;
|
||||
using System.Threading.Channels;
|
||||
|
||||
namespace Mbproxy.Proxy.Multiplexing;
|
||||
|
||||
/// <summary>
/// One accepted upstream client socket, exposed as an asynchronous frame pipe to the
/// owning <see cref="PlcMultiplexer"/>. The pipe reads complete MBAP frames from the
/// upstream socket and hands each one to a caller-supplied <c>onFrame</c> callback; it
/// also owns a bounded response channel whose contents the write loop sends back to the
/// upstream client.
///
/// <para><b>Lifecycle:</b> constructed by <see cref="PlcListener"/> on accept and handed
/// to the multiplexer. The read loop runs until the upstream socket closes, the supplied
/// token is cancelled, or the pipe is disposed.</para>
///
/// <para><b>Concurrency model:</b> the pipe exposes one read loop and one write loop.
/// The read loop awaits <c>onFrame</c> before reading the next frame, so a single
/// upstream never has more than one frame being handed over at a time. The write loop
/// is the channel's only reader (<c>SingleReader = true</c>).</para>
/// </summary>
internal sealed partial class UpstreamPipe : IAsyncDisposable
{
    // Capacity 16: room to buffer responses while the upstream's TCP send buffer drains,
    // small enough that writers feel backpressure if the consumer is wedged. Once the
    // pipe is no longer alive (see IsAlive), SendResponseAsync drops frames before they
    // ever enter the channel.
    private const int ResponseChannelCapacity = 16;

    private readonly Socket _upstream;
    private readonly ILogger _logger;
    private readonly string _plcName;

    private readonly Channel<byte[]> _responseChannel = Channel.CreateBounded<byte[]>(
        new BoundedChannelOptions(ResponseChannelCapacity)
        {
            FullMode = BoundedChannelFullMode.Wait,   // backpressure, not drop
            SingleReader = true,
            SingleWriter = false,                     // multiplexer adds; potential future paths too
        });

    // Internal CTS: cancelled by DisposeAsync so both loops stop without waiting for the
    // upstream socket to close cleanly.
    private readonly CancellationTokenSource _cts = new();

    // 0 = live, 1 = disposal started. An int so DisposeAsync can claim it atomically.
    private int _disposed;

    // Per-pipe forwarded-PDU counter, read by the status page.
    private long _pdusForwardedCount;

    /// <summary>Stable identity for status-page reporting and cascade cleanup.</summary>
    public Guid Id { get; } = Guid.NewGuid();

    /// <summary>The upstream client's remote endpoint, captured at construction.</summary>
    public IPEndPoint? RemoteEp { get; }

    /// <summary>UTC time at which this pipe object was constructed.</summary>
    public DateTimeOffset ConnectedAtUtc { get; } = DateTimeOffset.UtcNow;

    /// <summary>
    /// Number of request frames read from this upstream. Incremented by
    /// <see cref="RunReadLoopAsync"/> once a frame has been fully read, immediately
    /// before it is handed to the callback.
    /// </summary>
    public long PdusForwardedCount => Interlocked.Read(ref _pdusForwardedCount);

    /// <summary>
    /// <c>true</c> until <see cref="DisposeAsync"/> has been called. A socket fault in
    /// either loop does not by itself change this value; the owner is expected to
    /// dispose the pipe when a loop ends.
    /// </summary>
    public bool IsAlive => Volatile.Read(ref _disposed) == 0 && !_cts.IsCancellationRequested;

    /// <summary>
    /// Wraps an accepted upstream socket, disables Nagle on it, captures the remote
    /// endpoint and logs the connection.
    /// </summary>
    /// <param name="upstream">The accepted client socket; owned by the pipe from here on.</param>
    /// <param name="plcName">PLC name used in log events.</param>
    /// <param name="logger">Logger for connection and frame-level events.</param>
    public UpstreamPipe(Socket upstream, string plcName, ILogger logger)
    {
        _upstream = upstream;
        _upstream.NoDelay = true;
        RemoteEp = upstream.RemoteEndPoint as IPEndPoint;
        _plcName = plcName;
        _logger = logger;

        string remoteStr = RemoteEp?.ToString() ?? "?";
        MultiplexerLogEvents.ClientConnected(_logger, _plcName, remoteStr);
    }

    /// <summary>
    /// Runs the read side of the pipe: reads complete MBAP frames from the upstream
    /// socket and awaits <paramref name="onFrame"/> for each. Returns when:
    /// <list type="bullet">
    /// <item><description>The upstream closes the connection (EOF at any point in a frame).</description></item>
    /// <item><description>The header fails <c>MbapFrame.TryParseHeader</c>, or the declared body exceeds <c>MbapFrame.MaxPduBodySize</c>.</description></item>
    /// <item><description><paramref name="ct"/> is cancelled or the pipe is disposed.</description></item>
    /// </list>
    /// <see cref="OperationCanceledException"/>, <see cref="SocketException"/> and
    /// <see cref="ObjectDisposedException"/> are treated as normal termination and
    /// swallowed; any other exception (including one thrown by
    /// <paramref name="onFrame"/>) propagates to the caller.
    /// </summary>
    /// <param name="onFrame">
    /// Callback invoked with each frame. Every call receives a freshly allocated array
    /// that the callee may retain.
    /// </param>
    /// <param name="ct">Cancels the loop from outside the pipe.</param>
    public async Task RunReadLoopAsync(
        Func<byte[], CancellationToken, ValueTask> onFrame,
        CancellationToken ct)
    {
        // Only the fixed-size header is reused between frames; bodies get their own array.
        byte[] headerBuf = new byte[MbapFrame.HeaderSize];

        try
        {
            // Created inside the try on purpose: _cts.Token throws ObjectDisposedException
            // once DisposeAsync has run, and that must end the loop quietly rather than
            // escape to the caller.
            using var linked = CancellationTokenSource.CreateLinkedTokenSource(ct, _cts.Token);
            CancellationToken token = linked.Token;

            while (!token.IsCancellationRequested)
            {
                // Read the fixed-size MBAP header.
                if (!await FillAsync(_upstream, headerBuf, 0, MbapFrame.HeaderSize, token).ConfigureAwait(false))
                {
                    return; // EOF — upstream went away.
                }

                if (!MbapFrame.TryParseHeader(headerBuf.AsSpan(),
                        out _, out _, out ushort length, out _))
                {
                    return; // Header rejected by the parser — stop reading from this upstream.
                }

                if (length < 1)
                {
                    // Length field claims no body — forward the header alone via a fresh buffer.
                    byte[] degenerate = new byte[MbapFrame.HeaderSize];
                    Buffer.BlockCopy(headerBuf, 0, degenerate, 0, MbapFrame.HeaderSize);

                    Interlocked.Increment(ref _pdusForwardedCount);
                    await onFrame(degenerate, token).ConfigureAwait(false);
                    continue;
                }

                // The MBAP length counts the unit-id byte (already part of the header) plus the PDU.
                int pduBodyLen = length - 1;
                if (pduBodyLen > MbapFrame.MaxPduBodySize)
                {
                    // Frame larger than the protocol maximum — close the upstream.
                    _logger.LogWarning(
                        "Oversized upstream frame: Plc={Plc} PduBody={Body} > Max={Max}",
                        _plcName, pduBodyLen, MbapFrame.MaxPduBodySize);
                    return;
                }

                // Allocate a fresh frame buffer per PDU; the callee may retain it.
                byte[] frame = new byte[MbapFrame.HeaderSize + pduBodyLen];
                Buffer.BlockCopy(headerBuf, 0, frame, 0, MbapFrame.HeaderSize);

                if (!await FillAsync(_upstream, frame, MbapFrame.HeaderSize, pduBodyLen, token)
                        .ConfigureAwait(false))
                {
                    return; // EOF in the middle of a frame.
                }

                Interlocked.Increment(ref _pdusForwardedCount);
                await onFrame(frame, token).ConfigureAwait(false);
            }
        }
        catch (OperationCanceledException)
        {
            // Normal shutdown.
        }
        catch (SocketException)
        {
            // Upstream socket closed by remote end — normal.
        }
        catch (ObjectDisposedException)
        {
            // Socket or CTS disposed by DisposeAsync — normal.
        }
    }

    /// <summary>
    /// Runs the write side of the pipe: drains the response channel and writes each
    /// frame to the upstream socket. Returns when the channel completes, the token is
    /// cancelled, or the socket fails.
    /// </summary>
    /// <param name="ct">Cancels the loop from outside the pipe.</param>
    public async Task RunWriteLoopAsync(CancellationToken ct)
    {
        try
        {
            // Inside the try for the same reason as in the read loop.
            using var linked = CancellationTokenSource.CreateLinkedTokenSource(ct, _cts.Token);
            CancellationToken token = linked.Token;

            await foreach (var frame in _responseChannel.Reader.ReadAllAsync(token).ConfigureAwait(false))
            {
                await SendAllAsync(_upstream, frame.AsMemory(), token).ConfigureAwait(false);
            }
        }
        catch (OperationCanceledException)
        {
            // Normal shutdown.
        }
        catch (SocketException)
        {
            // Upstream remote closed — normal.
        }
        catch (ObjectDisposedException)
        {
            // Socket or CTS disposed elsewhere — normal.
        }
    }

    /// <summary>
    /// Enqueues <paramref name="frame"/> for delivery on the upstream socket. Returns
    /// immediately, dropping the frame, when the pipe is no longer alive. Otherwise it
    /// waits for channel capacity; a closed channel or a cancelled
    /// <paramref name="ct"/> also drops the frame silently.
    /// </summary>
    /// <param name="frame">Complete response frame to send.</param>
    /// <param name="ct">Cancels the wait for channel capacity.</param>
    public async ValueTask SendResponseAsync(byte[] frame, CancellationToken ct)
    {
        if (!IsAlive)
        {
            return;
        }

        try
        {
            await _responseChannel.Writer.WriteAsync(frame, ct).ConfigureAwait(false);
        }
        catch (ChannelClosedException)
        {
            // Pipe disposed mid-write — drop silently.
        }
        catch (OperationCanceledException)
        {
            // Caller cancelled — drop silently.
        }
    }

    /// <summary>
    /// Closes the pipe: completes the response channel, cancels both loops and shuts
    /// down the socket. Idempotent and safe to call concurrently — only the first
    /// caller performs the teardown.
    /// </summary>
    public async ValueTask DisposeAsync()
    {
        // Atomic claim: a check-then-set on a plain bool would let two concurrent callers
        // both proceed and hit an already-disposed CTS.
        if (Interlocked.Exchange(ref _disposed, 1) != 0)
        {
            return;
        }

        // Completing the writer wakes any SendResponseAsync blocked on a full channel.
        _responseChannel.Writer.TryComplete();

        await _cts.CancelAsync().ConfigureAwait(false);

        try
        {
            _upstream.Shutdown(SocketShutdown.Both);
        }
        catch (SocketException)
        {
            // Already closed or never fully connected.
        }
        catch (ObjectDisposedException)
        {
            // Already disposed.
        }

        _upstream.Dispose();
        _cts.Dispose();

        string remoteStr = RemoteEp?.ToString() ?? "?";
        MultiplexerLogEvents.ClientDisconnected(_logger, _plcName, remoteStr, "Pipe disposed");
    }

    // ── Low-level I/O helpers ─────────────────────────────────────────────────────

    /// <summary>
    /// Reads exactly <paramref name="count"/> bytes into <paramref name="buf"/> starting
    /// at <paramref name="offset"/>.
    /// </summary>
    /// <returns>
    /// <c>true</c> when the requested bytes were all read; <c>false</c> when the peer
    /// closed the connection first, whether before the first byte or part-way through.
    /// </returns>
    private static async Task<bool> FillAsync(
        Socket socket, byte[] buf, int offset, int count, CancellationToken ct)
    {
        int filled = 0;

        while (filled < count)
        {
            int received = await socket.ReceiveAsync(
                buf.AsMemory(offset + filled, count - filled),
                SocketFlags.None,
                ct).ConfigureAwait(false);

            if (received == 0)
            {
                return false; // EOF
            }

            filled += received;
        }

        return true;
    }

    /// <summary>
    /// Sends the whole of <paramref name="memory"/>, looping over partial sends.
    /// A zero-byte send is reported as a connection reset.
    /// </summary>
    private static async Task SendAllAsync(Socket socket, Memory<byte> memory, CancellationToken ct)
    {
        while (memory.Length > 0)
        {
            int sent = await socket.SendAsync(memory, SocketFlags.None, ct).ConfigureAwait(false);
            if (sent == 0)
            {
                throw new SocketException((int)SocketError.ConnectionReset);
            }

            memory = memory[sent..];
        }
    }
}
|
||||
@@ -0,0 +1,19 @@
|
||||
namespace Mbproxy.Proxy;
|
||||
|
||||
/// <summary>
/// Pass-through <see cref="IPduPipeline"/>: <see cref="Process"/> leaves the PDU bytes
/// untouched, so every frame is forwarded exactly as received. This was the pipeline
/// registered in Phase 03; from Phase 04 onward the production registration is
/// BcdPduPipeline.
/// </summary>
internal sealed class NoopPduPipeline : IPduPipeline
{
    /// <summary>Does nothing; none of the arguments are read or modified.</summary>
    public void Process(MbapDirection direction, ReadOnlySpan<byte> mbapHeader, Span<byte> pdu, PduContext context)
    {
        // Deliberately empty — the caller forwards the bytes unmodified.
    }
}
|
||||
@@ -0,0 +1,60 @@
|
||||
using Mbproxy.Bcd;
|
||||
using Mbproxy.Proxy.Multiplexing;
|
||||
|
||||
namespace Mbproxy.Proxy;
|
||||
|
||||
/// <summary>
/// Per-PLC pipeline context: the resolved BCD tag map, the live counters and a logger.
/// Derives from <see cref="PduContext"/> so it can be passed wherever a
/// <see cref="PduContext"/> is expected.
///
/// <para>One instance per configured PLC is intended to be shared by everything that
/// serves that PLC (see <see cref="Multiplexing.PlcMultiplexer"/>). All properties are
/// init-only, so a constructed instance is never mutated; the counters object is
/// expected to do its own synchronisation.</para>
///
/// <para><b>Request correlation:</b> because the shared instance cannot carry a
/// per-call value safely, callers obtain a per-call copy via
/// <see cref="WithCurrentRequest"/> that carries the <see cref="InFlightRequest"/> the
/// pipeline should consult. Each copy references its own request, so concurrent calls
/// do not see each other's correlation data.</para>
/// </summary>
internal class PerPlcContext : PduContext
{
    /// <summary>BCD tag map for this PLC; defaults to <see cref="BcdTagMap.Empty"/>.</summary>
    public BcdTagMap TagMap { get; init; } = BcdTagMap.Empty;

    /// <summary>Live counters for this PLC; defaults to a fresh, zeroed instance.</summary>
    public ProxyCounters Counters { get; init; } = new ProxyCounters();

    /// <summary>Logger used by the pipeline; defaults to the null logger.</summary>
    public ILogger Logger { get; init; } = Microsoft.Extensions.Logging.Abstractions.NullLogger.Instance;

    /// <summary>
    /// Correlation entry for the pipeline call this context was created for, or
    /// <c>null</c> when none applies. On response calls it gives the BCD rewriter the
    /// originating request's start address and quantity, which are not present in an
    /// FC03/FC04 response PDU.
    /// NOTE(review): whether request-direction calls ever receive a non-null value is
    /// decided by the multiplexer and is not visible here — confirm against
    /// PlcMultiplexer.
    /// </summary>
    internal InFlightRequest? CurrentRequest { get; init; }

    /// <summary>
    /// Builds a new <see cref="PerPlcContext"/> that shares this instance's
    /// <c>PlcName</c>, <see cref="TagMap"/>, <see cref="Counters"/> and
    /// <see cref="Logger"/> and has <see cref="CurrentRequest"/> set to
    /// <paramref name="req"/>. Only those members are copied. Costs one allocation per
    /// call.
    /// </summary>
    /// <param name="req">The correlation entry for the upcoming pipeline call, or <c>null</c>.</param>
    internal PerPlcContext WithCurrentRequest(InFlightRequest? req)
    {
        var perCall = new PerPlcContext
        {
            PlcName = PlcName,
            TagMap = TagMap,
            Counters = Counters,
            Logger = Logger,
            CurrentRequest = req,
        };

        return perCall;
    }
}
|
||||
@@ -0,0 +1,188 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Net;
|
||||
using System.Net.Sockets;
|
||||
using Mbproxy.Options;
|
||||
using Mbproxy.Proxy.Multiplexing;
|
||||
using Polly;
|
||||
|
||||
namespace Mbproxy.Proxy;
|
||||
|
||||
/// <summary>
/// Owns one <see cref="TcpListener"/> bound to a PLC's configured listen port and one
/// <see cref="PlcMultiplexer"/> for that PLC.
///
/// <para><b>Phase 9 — TxId multiplexing:</b> the listener does not pair each upstream
/// socket with a dedicated backend socket. Every accepted upstream is wrapped in an
/// <see cref="UpstreamPipe"/> and handed to the multiplexer via
/// <c>StartPipeAsync</c>.</para>
///
/// <para><b>Usage:</b> call <see cref="StartAsync"/> to bind the socket and build the
/// multiplexer, then <see cref="RunAsync"/> to accept clients until cancelled or the
/// listener faults. <see cref="DisposeAsync"/> tears down the listener and the
/// multiplexer and waits briefly for outstanding pipe tasks.</para>
/// </summary>
internal sealed partial class PlcListener : IAsyncDisposable
{
    private readonly PlcOptions _plc;
    private readonly ConnectionOptions _connectionOptions;
    private readonly IPduPipeline _pipeline;
    private readonly ILogger<PlcListener> _listenerLogger;
    private readonly ILogger<PlcMultiplexer> _multiplexerLogger;
    private readonly ILogger _pipeLogger;
    private readonly PerPlcContext? _perPlcContext;
    private readonly ResiliencePipeline? _backendConnectPipeline;

    private TcpListener? _listener;
    private PlcMultiplexer? _multiplexer;
    private bool _disposed;

    // Track active pipe-handling tasks so DisposeAsync can wait for them.
    private readonly ConcurrentDictionary<Guid, Task> _pipeTasks = new();

    /// <summary>
    /// Live collection of <see cref="UpstreamPipe"/> instances attached to this
    /// listener's multiplexer, for status-page reporting. Empty while no multiplexer
    /// exists (before <see cref="StartAsync"/> or after <see cref="DisposeAsync"/>).
    /// </summary>
    public IReadOnlyCollection<UpstreamPipe> ActiveUpstreams
        => _multiplexer?.AttachedPipes ?? Array.Empty<UpstreamPipe>();

    /// <summary>Captures the configuration and collaborators; performs no I/O.</summary>
    /// <param name="plc">PLC options (name and listen port are used here).</param>
    /// <param name="connectionOptions">Connection options passed through to the multiplexer.</param>
    /// <param name="pipeline">PDU pipeline passed through to the multiplexer.</param>
    /// <param name="listenerLogger">Logger for bind and fault events of this listener.</param>
    /// <param name="multiplexerLogger">Logger handed to the multiplexer.</param>
    /// <param name="pipeLogger">Logger handed to each <see cref="UpstreamPipe"/>.</param>
    /// <param name="perPlcContext">Optional shared context; a minimal one is built when omitted.</param>
    /// <param name="backendConnectPipeline">Optional resilience pipeline passed through to the multiplexer.</param>
    public PlcListener(
        PlcOptions plc,
        ConnectionOptions connectionOptions,
        IPduPipeline pipeline,
        ILogger<PlcListener> listenerLogger,
        ILogger<PlcMultiplexer> multiplexerLogger,
        ILogger pipeLogger,
        PerPlcContext? perPlcContext = null,
        ResiliencePipeline? backendConnectPipeline = null)
    {
        _plc = plc;
        _connectionOptions = connectionOptions;
        _pipeline = pipeline;
        _listenerLogger = listenerLogger;
        _multiplexerLogger = multiplexerLogger;
        _pipeLogger = pipeLogger;
        _perPlcContext = perPlcContext;
        _backendConnectPipeline = backendConnectPipeline;
    }

    /// <summary>
    /// Binds the listen socket on all interfaces and constructs the multiplexer.
    /// Synchronous despite the name. Throws <see cref="SocketException"/> on bind
    /// failure; handling that is left to the caller
    /// (<see cref="Supervision.PlcListenerSupervisor"/>).
    /// </summary>
    public void StartAsync()
    {
        var endpoint = new IPEndPoint(IPAddress.Any, _plc.ListenPort);
        _listener = new TcpListener(endpoint);
        _listener.Start();
        LogBound(_listenerLogger, _plc.Name, _plc.ListenPort);

        // The multiplexer needs a PerPlcContext. When the caller did not supply one,
        // fall back to a minimal context carrying only the PLC name and the pipe logger;
        // its tag map and counters take their defaults.
        var ctx = _perPlcContext ?? new PerPlcContext
        {
            PlcName = _plc.Name,
            Logger = _pipeLogger,
        };

        _multiplexer = new PlcMultiplexer(
            _plc,
            _connectionOptions,
            _pipeline,
            ctx,
            _multiplexerLogger,
            _backendConnectPipeline);
    }

    /// <summary>
    /// Runs the accept loop until <paramref name="ct"/> is cancelled or the listener
    /// faults. Each accepted socket is wrapped in an <see cref="UpstreamPipe"/> and
    /// served on its own task; the pipe is disposed when that task ends. A listener
    /// fault is logged and swallowed so the caller can decide whether to restart.
    /// </summary>
    /// <param name="ct">Stops accepting and is forwarded to each pipe.</param>
    /// <exception cref="InvalidOperationException">
    /// <see cref="StartAsync"/> has not been called (or the listener was disposed).
    /// </exception>
    public async Task RunAsync(CancellationToken ct)
    {
        TcpListener listener = _listener
            ?? throw new InvalidOperationException("StartAsync must be called before RunAsync.");

        // Captured once: DisposeAsync nulls the field, and a pipe task scheduled just
        // before disposal must not dereference a null field when it finally runs.
        PlcMultiplexer multiplexer = _multiplexer
            ?? throw new InvalidOperationException("StartAsync must construct the multiplexer before RunAsync.");

        try
        {
            while (!ct.IsCancellationRequested)
            {
                Socket upstream = await listener.AcceptSocketAsync(ct).ConfigureAwait(false);

                UpstreamPipe pipe;
                try
                {
                    pipe = new UpstreamPipe(upstream, _plc.Name, _pipeLogger);
                }
                catch (Exception ex) when (ex is SocketException or ObjectDisposedException)
                {
                    // The pipe constructor touches the socket (NoDelay, RemoteEndPoint),
                    // which can fail if the peer has already dropped the connection.
                    // That is a problem with this one client, not with the listener:
                    // release the socket and keep accepting.
                    _listenerLogger.LogWarning(
                        "Upstream rejected during setup: Plc={Plc} Reason={Reason}",
                        _plc.Name, ex.Message);
                    upstream.Dispose();
                    continue;
                }

                var pipeTask = Task.Run(async () =>
                {
                    try
                    {
                        await multiplexer.StartPipeAsync(pipe, ct).ConfigureAwait(false);
                    }
                    finally
                    {
                        await pipe.DisposeAsync().ConfigureAwait(false);
                    }
                }, CancellationToken.None);

                // Register first, then attach the removal continuation, so the entry is
                // always removed after it was added even if the task has already finished.
                _pipeTasks[pipe.Id] = pipeTask;
                _ = pipeTask.ContinueWith(prev => _pipeTasks.TryRemove(pipe.Id, out _), TaskScheduler.Default);
            }
        }
        catch (OperationCanceledException)
        {
            // Normal shutdown.
        }
        catch (Exception ex)
        {
            // Listener faulted — log and return so the owner can restart it.
            LogListenerFaulted(_listenerLogger, _plc.Name, _plc.ListenPort, ex.Message);
        }
    }

    // ── IAsyncDisposable ──────────────────────────────────────────────────────────────────

    /// <summary>
    /// Stops the listener, disposes the multiplexer, then waits up to five seconds for
    /// the pipe tasks that were still registered. Subsequent calls return immediately.
    /// </summary>
    public async ValueTask DisposeAsync()
    {
        if (_disposed)
        {
            return;
        }

        _disposed = true;

        _listener?.Stop();

        if (_multiplexer is not null)
        {
            await _multiplexer.DisposeAsync().ConfigureAwait(false);
            _multiplexer = null;
        }

        Task[] snapshot = _pipeTasks.Values.ToArray();
        if (snapshot.Length > 0)
        {
            using var timeout = new CancellationTokenSource(TimeSpan.FromSeconds(5));
            try
            {
                await Task.WhenAll(snapshot)
                    .WaitAsync(timeout.Token)
                    .ConfigureAwait(false);
            }
            catch
            {
                // Best effort: a timeout or a faulted pipe task must not fail disposal.
            }
        }
    }

    // ── Logging ───────────────────────────────────────────────────────────────────────────

    [LoggerMessage(EventId = 20, EventName = "mbproxy.startup.bind",
        Level = LogLevel.Information, Message = "Listener bound: Plc={Plc} Port={Port}")]
    private static partial void LogBound(ILogger logger, string plc, int port);

    [LoggerMessage(EventId = 22, EventName = "mbproxy.listener.faulted",
        Level = LogLevel.Error, Message = "Listener faulted: Plc={Plc} Port={Port} Reason={Reason}")]
    private static partial void LogListenerFaulted(ILogger logger, string plc, int port, string reason);
}
|
||||
@@ -0,0 +1,336 @@
|
||||
namespace Mbproxy.Proxy;
|
||||
|
||||
/// <summary>
/// Immutable snapshot of per-PLC counters. Consumed by Phase 07's status page.
/// All fields are point-in-time reads; no ordering guarantees across fields.
///
/// <para><b>Backwards-compat policy (see docs/kpi.md):</b> fields are <i>added</i>, never
/// renamed or removed. Phase 9 appended <c>InFlightCount</c>, <c>MaxInFlight</c>,
/// <c>TxIdWraps</c>, <c>BackendDisconnectCascades</c>, and <c>BackendQueueDepth</c> for
/// the TxId-multiplexer telemetry surface (Tier 1.6 in docs/kpi.md).</para>
/// </summary>
/// <param name="PdusForwarded">Total PDUs forwarded.</param>
/// <param name="Fc03">Count of FC03 PDUs.</param>
/// <param name="Fc04">Count of FC04 PDUs.</param>
/// <param name="Fc06">Count of FC06 PDUs.</param>
/// <param name="Fc16">Count of FC16 PDUs.</param>
/// <param name="FcOther">Count of PDUs with any other function code.</param>
/// <param name="RewrittenSlots">Total BCD slots rewritten.</param>
/// <param name="PartialBcdWarnings">Count of partial-BCD warnings.</param>
/// <param name="InvalidBcdWarnings">Count of invalid-BCD warnings.</param>
/// <param name="BackendException01">Count of backend Modbus exception code 01 responses.</param>
/// <param name="BackendException02">Count of backend Modbus exception code 02 responses.</param>
/// <param name="BackendException03">Count of backend Modbus exception code 03 responses.</param>
/// <param name="BackendException04">Count of backend Modbus exception code 04 responses.</param>
/// <param name="BackendExceptionOther">Count of backend Modbus exception responses with any other code.</param>
/// <param name="BytesUpstreamIn">Bytes received from upstream clients.</param>
/// <param name="BytesUpstreamOut">Bytes sent to upstream clients.</param>
/// <param name="RecoveryAttempts">
/// Total number of failed listener bind attempts over the lifetime of the supervisor.
/// Accumulates; never resets. See <see cref="SupervisorSnapshot.RecoveryAttempts"/> doc.
/// </param>
/// <param name="LastBindError">
/// Most recent bind failure message (up to 256 chars); <c>null</c> if the listener
/// has never failed to bind.
/// </param>
/// <param name="LastRoundTripMs">
/// EWMA of recent backend round-trip times in milliseconds (α = 0.2).
/// Zero when no successful round-trips have been observed yet.
/// Held internally as a fixed-point <c>long</c> for Interlocked compatibility and
/// converted to double milliseconds on snapshot.
/// </param>
/// <param name="ConnectsSuccess">
/// Number of backend connections successfully established (Polly final success).
/// </param>
/// <param name="ConnectsFailed">
/// Number of backend connections that failed on all Polly attempts.
/// </param>
/// <param name="InFlightCount">
/// Number of Modbus requests currently in flight on this PLC's multiplexed backend
/// connection (point-in-time snapshot of the correlation map size). Phase 9.
/// </param>
/// <param name="MaxInFlight">
/// Peak <paramref name="InFlightCount"/> observed since the multiplexer was constructed.
/// Updated via <see cref="System.Threading.Interlocked"/> CAS so concurrent in-flight
/// increments do not lose the high-water mark. Phase 9.
/// </param>
/// <param name="TxIdWraps">
/// Number of times the per-PLC TxId allocator's rolling cursor has wrapped
/// 0xFFFF → 0x0000. A non-zero value is benign; a sudden burst suggests extreme
/// in-flight churn. Phase 9.
/// </param>
/// <param name="BackendDisconnectCascades">
/// Cumulative count of upstream pipes closed as a side effect of a backend disconnect.
/// Each backend reconnect cycle adds the number of attached upstream clients at the
/// time of the disconnect. Phase 9.
/// </param>
/// <param name="BackendQueueDepth">
/// Current depth of the per-PLC outbound channel feeding the backend writer task
/// (frames queued, not yet on the wire). A sustained non-zero value indicates the
/// backend is slower than upstream demand. Phase 9.
/// </param>
public sealed record CounterSnapshot(
    long PdusForwarded,
    long Fc03,
    long Fc04,
    long Fc06,
    long Fc16,
    long FcOther,
    long RewrittenSlots,
    long PartialBcdWarnings,
    long InvalidBcdWarnings,
    long BackendException01,
    long BackendException02,
    long BackendException03,
    long BackendException04,
    long BackendExceptionOther,
    long BytesUpstreamIn,
    long BytesUpstreamOut,
    long RecoveryAttempts,
    string? LastBindError,
    double LastRoundTripMs,
    long ConnectsSuccess,
    long ConnectsFailed,
    long InFlightCount,
    long MaxInFlight,
    long TxIdWraps,
    long BackendDisconnectCascades,
    long BackendQueueDepth);
|
||||
|
||||
/// <summary>
/// Per-PLC counter set. Every numeric counter is a <c>long</c> that is only touched through
/// <see cref="System.Threading.Interlocked"/>, so the increment methods may be called
/// concurrently and do not allocate.
/// <see cref="Snapshot"/> constructs a <see cref="CounterSnapshot"/> record and therefore
/// may allocate; it is intended for the status-page path, not the forwarding path.
/// </summary>
internal sealed class ProxyCounters
{
    /// <summary>Maximum number of characters of a bind error retained for the status page.</summary>
    private const int MaxBindErrorLength = 256;

    // ── Hot-path fields (Interlocked longs) ─────────────────────────────────

    private long _pdusForwarded;
    private long _fc03;
    private long _fc04;
    private long _fc06;
    private long _fc16;
    private long _fcOther;
    private long _rewrittenSlots;
    private long _partialBcdWarnings;
    private long _invalidBcdWarnings;
    private long _backendException01;
    private long _backendException02;
    private long _backendException03;
    private long _backendException04;
    private long _backendExceptionOther;
    private long _bytesUpstreamIn;
    private long _bytesUpstreamOut;
    private long _recoveryAttempts;
    private long _connectsSuccess;
    private long _connectsFailed;

    // Phase 9 multiplexer telemetry.
    private long _maxInFlight;
    private long _backendDisconnectCascades;

    // Phase 9: live values (in-flight count, TxId wraps, queue depth) are pulled from this
    // provider each time Snapshot runs. Volatile so the snapshot path reads it without a lock.
    private volatile IMultiplexCountersProvider? _multiplexProvider;

    // Last bind error text, mirrored here so Snapshot can report it alongside the numeric
    // counters. Written by IncrementRecoveryAttempt / ClearLastBindError.
    private volatile string? _lastBindError;

    // EWMA of the request→response round-trip, held as whole microseconds (milliseconds × 1000)
    // in a long so it can be updated with Interlocked.CompareExchange.
    // Smoothing factor α = 0.2 (0.8^3 ≈ 0.51, i.e. a half-life of roughly three samples).
    // 0 doubles as the "no samples observed yet" marker.
    private long _lastRoundTripUsEwma;

    // ── Increment methods ────────────────────────────────────────────────────

    /// <summary>Counts one forwarded PDU.</summary>
    public void IncrementPdusForwarded()
        => Interlocked.Increment(ref _pdusForwarded);

    /// <summary>
    /// Counts one PDU against its function code. 0x03, 0x04, 0x06 and 0x10 have dedicated
    /// counters; every other value is counted as "other".
    /// </summary>
    public void IncrementFcCount(byte fc)
    {
        switch (fc)
        {
            case 0x03: Interlocked.Increment(ref _fc03); break;
            case 0x04: Interlocked.Increment(ref _fc04); break;
            case 0x06: Interlocked.Increment(ref _fc06); break;
            case 0x10: Interlocked.Increment(ref _fc16); break;
            default: Interlocked.Increment(ref _fcOther); break;
        }
    }

    /// <summary>Adds <paramref name="n"/> to the rewritten-slot total.</summary>
    public void AddRewrittenSlots(int n)
        => Interlocked.Add(ref _rewrittenSlots, n);

    /// <summary>Counts one partial-BCD warning.</summary>
    public void IncrementPartialBcd()
        => Interlocked.Increment(ref _partialBcdWarnings);

    /// <summary>Counts one invalid-BCD warning.</summary>
    public void IncrementInvalidBcd()
        => Interlocked.Increment(ref _invalidBcdWarnings);

    /// <summary>
    /// Counts one backend Modbus exception. Codes 1–4 have individual counters; any other
    /// code is counted as "other".
    /// </summary>
    public void IncrementBackendException(byte code)
    {
        switch (code)
        {
            case 1: Interlocked.Increment(ref _backendException01); break;
            case 2: Interlocked.Increment(ref _backendException02); break;
            case 3: Interlocked.Increment(ref _backendException03); break;
            case 4: Interlocked.Increment(ref _backendException04); break;
            default: Interlocked.Increment(ref _backendExceptionOther); break;
        }
    }

    /// <summary>
    /// Adds <paramref name="up"/> to the upstream-in byte total and <paramref name="down"/>
    /// to the upstream-out byte total. Each addition is atomic on its own; the pair is not
    /// applied as a single atomic unit, so a concurrent <see cref="Snapshot"/> may observe
    /// one total updated and not yet the other.
    /// </summary>
    public void AddBytes(long up, long down)
    {
        Interlocked.Add(ref _bytesUpstreamIn, up);
        Interlocked.Add(ref _bytesUpstreamOut, down);
    }

    /// <summary>Counts one successful backend TCP connect.</summary>
    public void IncrementConnectSuccess()
        => Interlocked.Increment(ref _connectsSuccess);

    /// <summary>Counts one failed backend TCP connect.</summary>
    public void IncrementConnectFailed()
        => Interlocked.Increment(ref _connectsFailed);

    /// <summary>
    /// Adds <paramref name="n"/> upstream pipes closed by a backend disconnect cascade.
    /// Phase 9.
    /// </summary>
    public void AddDisconnectCascades(int n)
        => Interlocked.Add(ref _backendDisconnectCascades, n);

    /// <summary>
    /// Raises the peak in-flight high-water mark to <paramref name="currentInFlight"/> when
    /// that value exceeds the stored peak; otherwise leaves it untouched. Implemented as a
    /// compare-and-swap loop so concurrent observers cannot lower the peak. Phase 9.
    /// </summary>
    public void ObserveInFlight(int currentInFlight)
    {
        long sample = currentInFlight;
        long old;
        do
        {
            old = Interlocked.Read(ref _maxInFlight);
            if (sample <= old) return; // not a new peak
        }
        while (Interlocked.CompareExchange(ref _maxInFlight, sample, old) != old);
    }

    /// <summary>
    /// Registers (or, with <c>null</c>, removes) the source of live multiplexer telemetry
    /// that <see cref="Snapshot"/> consults for in-flight count, TxId wraps and queue depth.
    /// Phase 9.
    /// </summary>
    internal void SetMultiplexProvider(IMultiplexCountersProvider? provider)
        => _multiplexProvider = provider;

    /// <summary>
    /// Counts one recovery attempt and stores <paramref name="errorMessage"/>, truncated to
    /// <see cref="MaxBindErrorLength"/> characters, as the last bind error.
    /// A <c>null</c> message is stored as an empty string rather than throwing after the
    /// counter has already been incremented.
    /// </summary>
    public void IncrementRecoveryAttempt(string errorMessage)
    {
        Interlocked.Increment(ref _recoveryAttempts);

        string message = errorMessage ?? string.Empty;
        _lastBindError = message.Length > MaxBindErrorLength
            ? message[..MaxBindErrorLength]
            : message;
    }

    /// <summary>Clears the last bind error.</summary>
    public void ClearLastBindError()
    {
        _lastBindError = null;
    }

    /// <summary>
    /// Folds one round-trip sample into the EWMA: <c>new = 0.2 * sample + 0.8 * old</c>;
    /// the first sample seeds the average directly.
    /// <paramref name="elapsedTicks"/> is an interval in
    /// <see cref="System.Diagnostics.Stopwatch"/> ticks (it is scaled by
    /// <see cref="System.Diagnostics.Stopwatch.Frequency"/>).
    /// Negative intervals are ignored. Thread-safe via a compare-and-swap loop.
    /// </summary>
    public void UpdateRoundTripEwma(long elapsedTicks)
    {
        // A negative interval is not a usable measurement; folding it in would pull the
        // average below zero and surface as a negative LastRoundTripMs.
        if (elapsedTicks < 0)
        {
            return;
        }

        // Stopwatch ticks → milliseconds.
        double sampleMs = (double)elapsedTicks / System.Diagnostics.Stopwatch.Frequency * 1000.0;

        // Milliseconds → whole microseconds, the unit stored in _lastRoundTripUsEwma
        // (Snapshot divides by 1000 to report milliseconds again).
        long sampleUs = (long)(sampleMs * 1000.0);

        long observed, updated;
        do
        {
            observed = Interlocked.Read(ref _lastRoundTripUsEwma);

            // 0 means "no previous sample": seed with this one; otherwise blend.
            updated = observed == 0
                ? sampleUs
                : (long)(0.2 * sampleUs + 0.8 * observed);
        }
        while (Interlocked.CompareExchange(ref _lastRoundTripUsEwma, updated, observed) != observed);
    }

    // ── Snapshot (off hot-path, may allocate) ────────────────────────────────

    /// <summary>
    /// Builds a <see cref="CounterSnapshot"/> from the current counter values. Each numeric
    /// field is read individually with <see cref="Interlocked.Read"/>; the snapshot as a
    /// whole is not taken under a lock, so fields may reflect slightly different instants.
    /// Live multiplexer values come from the registered provider, or 0 when none is set.
    /// </summary>
    public CounterSnapshot Snapshot()
    {
        // Read the volatile reference once so all three live values come from the same provider.
        var provider = _multiplexProvider;
        long inFlightNow = provider?.InFlightCount ?? 0;
        long txWraps = provider?.TxIdWraps ?? 0;
        long queueDepth = provider?.BackendQueueDepth ?? 0;

        return new(
            PdusForwarded: Interlocked.Read(ref _pdusForwarded),
            Fc03: Interlocked.Read(ref _fc03),
            Fc04: Interlocked.Read(ref _fc04),
            Fc06: Interlocked.Read(ref _fc06),
            Fc16: Interlocked.Read(ref _fc16),
            FcOther: Interlocked.Read(ref _fcOther),
            RewrittenSlots: Interlocked.Read(ref _rewrittenSlots),
            PartialBcdWarnings: Interlocked.Read(ref _partialBcdWarnings),
            InvalidBcdWarnings: Interlocked.Read(ref _invalidBcdWarnings),
            BackendException01: Interlocked.Read(ref _backendException01),
            BackendException02: Interlocked.Read(ref _backendException02),
            BackendException03: Interlocked.Read(ref _backendException03),
            BackendException04: Interlocked.Read(ref _backendException04),
            BackendExceptionOther: Interlocked.Read(ref _backendExceptionOther),
            BytesUpstreamIn: Interlocked.Read(ref _bytesUpstreamIn),
            BytesUpstreamOut: Interlocked.Read(ref _bytesUpstreamOut),
            RecoveryAttempts: Interlocked.Read(ref _recoveryAttempts),
            LastBindError: _lastBindError,
            // Stored in microseconds; reported in milliseconds.
            LastRoundTripMs: Interlocked.Read(ref _lastRoundTripUsEwma) / 1000.0,
            ConnectsSuccess: Interlocked.Read(ref _connectsSuccess),
            ConnectsFailed: Interlocked.Read(ref _connectsFailed),
            InFlightCount: inFlightNow,
            MaxInFlight: Interlocked.Read(ref _maxInFlight),
            TxIdWraps: txWraps,
            BackendDisconnectCascades: Interlocked.Read(ref _backendDisconnectCascades),
            BackendQueueDepth: queueDepth);
    }
}
|
||||
|
||||
/// <summary>
/// Supplies the live multiplexer values that <see cref="ProxyCounters.Snapshot"/> folds into
/// each snapshot. An implementation is handed to
/// <see cref="ProxyCounters.SetMultiplexProvider"/>; the counter set then reads these three
/// properties on demand instead of holding a reference to the multiplexer type itself.
/// Phase 9.
/// </summary>
internal interface IMultiplexCountersProvider
{
    /// <summary>How many requests are in flight on the backend socket right now.</summary>
    long InFlightCount { get; }

    /// <summary>Running total of TxId allocator wrap-arounds (0xFFFF → 0x0000).</summary>
    long TxIdWraps { get; }

    /// <summary>How many frames are currently queued in the outbound channel for the backend writer.</summary>
    long BackendQueueDepth { get; }
}
|
||||
@@ -0,0 +1,218 @@
|
||||
using Mbproxy.Bcd;
|
||||
using Mbproxy.Configuration;
|
||||
using Mbproxy.Options;
|
||||
using Mbproxy.Proxy.Multiplexing;
|
||||
using Mbproxy.Proxy.Supervision;
|
||||
using Microsoft.Extensions.Options;
|
||||
using Polly;
|
||||
|
||||
namespace Mbproxy.Proxy;
|
||||
|
||||
/// <summary>
/// <see cref="BackgroundService"/> that owns all <see cref="PlcListenerSupervisor"/> instances.
///
/// Startup posture (matches design doc "eager, continue on per-port failure"):
/// <list type="number">
///   <item>Enumerate <see cref="MbproxyOptions.Plcs"/> and build one supervisor per PLC
///         whose BCD tag map builds without errors.</item>
///   <item>Start all supervisors in parallel. Each supervisor attempts to bind immediately
///         and enters the Polly recovery loop if the bind fails.</item>
///   <item>After all supervisors have completed their first bind attempt (reached
///         <see cref="SupervisorState.Bound"/> or <see cref="SupervisorState.Recovering"/>),
///         or a 30-second deadline has elapsed, log <c>mbproxy.startup.ready</c> with
///         bound/configured counts.</item>
/// </list>
///
/// Phase 06: passes the supervisor dictionary to <see cref="ConfigReconciler.Attach"/>
/// before the supervisors are started, so hot-reload changes are applied by the reconciler.
///
/// Stop: stops all supervisors in parallel with a 5-second hard deadline, then disposes
/// each one. A failure in one supervisor does not prevent the others from being disposed.
/// </summary>
internal sealed partial class ProxyWorker : BackgroundService
{
    /// <summary>How long startup waits for every supervisor's first bind attempt.</summary>
    private static readonly TimeSpan ReadyDeadline = TimeSpan.FromSeconds(30);

    /// <summary>Hard deadline for stopping all supervisors during shutdown.</summary>
    private static readonly TimeSpan StopDeadline = TimeSpan.FromSeconds(5);

    private readonly IOptionsMonitor<MbproxyOptions> _options;
    private readonly IPduPipeline _pipeline;
    private readonly ILogger<ProxyWorker> _logger;
    private readonly ILoggerFactory _loggerFactory;
    private readonly ConfigReconciler _reconciler;

    // Phase 06: supervisors are managed jointly by ProxyWorker (initial bootstrap) and
    // ConfigReconciler (subsequent hot-reload changes). The same dictionary instance is
    // handed to ConfigReconciler.Attach().
    // NOTE(review): this is a plain Dictionary shared with the reconciler and the status
    // page; confirm that mutation and enumeration are serialised elsewhere.
    private readonly Dictionary<string, PlcListenerSupervisor> _supervisors = new(StringComparer.Ordinal);

    /// <summary>
    /// Read-only view of the live supervisor dictionary. Consumed by Phase 07's
    /// <see cref="Admin.StatusSnapshotBuilder"/> to enumerate per-PLC state.
    /// The caller should read this on the status-page path only (not the hot path).
    /// </summary>
    internal IReadOnlyDictionary<string, PlcListenerSupervisor> Supervisors => _supervisors;

    /// <summary>Stores the injected dependencies; no work is done until <see cref="ExecuteAsync"/>.</summary>
    public ProxyWorker(
        IOptionsMonitor<MbproxyOptions> options,
        IPduPipeline pipeline,
        ILogger<ProxyWorker> logger,
        ILoggerFactory loggerFactory,
        ConfigReconciler reconciler)
    {
        _options = options;
        _pipeline = pipeline;
        _logger = logger;
        _loggerFactory = loggerFactory;
        _reconciler = reconciler;
    }

    /// <summary>
    /// Builds and starts one supervisor per usable PLC, logs readiness once every supervisor
    /// has made its first bind attempt (or the readiness deadline has passed), then idles
    /// until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    /// <param name="stoppingToken">Cancelled by the host when the service is stopping.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        var opts = _options.CurrentValue;
        int plcsConfigured = opts.Plcs.Count;

        // ── 1–3. Per-PLC contexts, Polly pipelines, supervisors ───────────────────────
        var plcContexts = BuildPlcContexts(opts);
        BuildSupervisors(opts, plcContexts);

        // ── Phase 06: wire reconciler BEFORE starting supervisors ─────────────────────
        // Attach hands the reconciler the authoritative supervisor dictionary and the
        // initial options snapshot.
        _reconciler.Attach(_supervisors, opts);

        if (_supervisors.Count == 0)
        {
            LogStartupReady(_logger, 0, plcsConfigured);
            await Task.Delay(Timeout.Infinite, stoppingToken).ConfigureAwait(false);
            return;
        }

        // ── 4. Start all supervisors in parallel ──────────────────────────────────────
        var startTasks = _supervisors.Values
            .Select(s => s.StartAsync(stoppingToken))
            .ToArray();
        await Task.WhenAll(startTasks).ConfigureAwait(false);

        // ── 5. Wait for every supervisor to complete its first bind attempt ───────────
        // "Ready" = every supervisor has transitioned out of Stopped (i.e. reached
        // Bound or Recovering from its first attempt).
        using var readyCts = new CancellationTokenSource(ReadyDeadline);
        using var readyLinked = CancellationTokenSource.CreateLinkedTokenSource(
            readyCts.Token, stoppingToken);

        var waitTasks = _supervisors.Values
            .Select(s => s.WaitForInitialBindAttemptAsync(readyLinked.Token))
            .ToArray();

        try
        {
            await Task.WhenAll(waitTasks).ConfigureAwait(false);
        }
        catch (OperationCanceledException) when (!stoppingToken.IsCancellationRequested)
        {
            // The readiness deadline fired: report whatever is bound so far.
            // A host stop is deliberately NOT swallowed here — it propagates, so the
            // "ready" event is never logged while the service is shutting down.
        }

        int boundCount = _supervisors.Values.Count(s => s.Snapshot().State == SupervisorState.Bound);
        LogStartupReady(_logger, boundCount, plcsConfigured);

        // ── 6. Keep the worker alive until the host signals stop ──────────────────────
        // Supervisors run their own background loops; ExecuteAsync just waits.
        await Task.Delay(Timeout.Infinite, stoppingToken).ConfigureAwait(false);
    }

    /// <summary>
    /// Stops <see cref="ExecuteAsync"/>, then stops every supervisor in parallel under a
    /// 5-second deadline (linked with <paramref name="cancellationToken"/>), disposes each
    /// supervisor, and clears the dictionary. Failures while stopping or disposing are
    /// logged and do not interrupt the remaining shutdown steps.
    /// </summary>
    public override async Task StopAsync(CancellationToken cancellationToken)
    {
        // Cancel ExecuteAsync first.
        await base.StopAsync(cancellationToken).ConfigureAwait(false);

        using var stopCts = new CancellationTokenSource(StopDeadline);
        using var linked = CancellationTokenSource.CreateLinkedTokenSource(
            stopCts.Token, cancellationToken);

        var stopTasks = _supervisors.Values
            .Select(s => s.StopAsync(linked.Token))
            .ToArray();

        try
        {
            await Task.WhenAll(stopTasks).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            // Best effort — individual supervisor failures must not block shutdown,
            // but they should be visible.
            _logger.LogWarning(ex, "One or more supervisors failed to stop cleanly; continuing shutdown.");
        }

        foreach (var supervisor in _supervisors.Values)
        {
            try
            {
                await supervisor.DisposeAsync().ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                // Keep going so the remaining supervisors are still disposed.
                _logger.LogWarning(ex, "[{Plc}] Supervisor disposal failed; continuing shutdown.", supervisor.PlcName);
            }
        }

        _supervisors.Clear();
    }

    /// <summary>
    /// Builds the BCD tag map for each configured PLC and wraps it in a
    /// <see cref="PerPlcContext"/>. Warnings are logged; a PLC whose map has errors is
    /// logged and omitted from the result, so no listener is created for it.
    /// </summary>
    private Dictionary<string, PerPlcContext> BuildPlcContexts(MbproxyOptions opts)
    {
        var plcContexts = new Dictionary<string, PerPlcContext>(opts.Plcs.Count, StringComparer.Ordinal);

        foreach (var plc in opts.Plcs)
        {
            var result = BcdTagMapBuilder.Build(opts.BcdTags, plc.BcdTags);

            foreach (var warn in result.Warnings)
            {
                _logger.LogWarning("[{Plc}] BCD tag map warning: {Message}", plc.Name, warn.Message);
            }

            if (result.Errors.Count > 0)
            {
                foreach (var err in result.Errors)
                {
                    _logger.LogError("[{Plc}] BCD tag map error ({Kind}): {Message}",
                        plc.Name, err.Kind, err.Message);
                }

                _logger.LogError("Skipping listener for PLC '{Plc}' due to BCD tag map errors.", plc.Name);
                continue;
            }

            plcContexts[plc.Name] = new PerPlcContext
            {
                PlcName = plc.Name,
                TagMap = result.Map,
                Counters = new ProxyCounters(),
                Logger = _loggerFactory.CreateLogger($"Mbproxy.Proxy.BcdRewriter.{plc.Name}"),
            };
        }

        return plcContexts;
    }

    /// <summary>
    /// Creates one <see cref="PlcListenerSupervisor"/> per PLC that has a context in
    /// <paramref name="plcContexts"/> and stores it in the supervisor dictionary. The
    /// backend-connect pipeline is built once and shared; each supervisor gets its own
    /// listener-recovery pipeline.
    /// </summary>
    private void BuildSupervisors(MbproxyOptions opts, Dictionary<string, PerPlcContext> plcContexts)
    {
        var resilienceOpts = opts.Resilience;
        var backendPipeline = PolicyFactory.BuildBackendConnect(
            resilienceOpts.BackendConnect,
            _loggerFactory.CreateLogger("Mbproxy.Proxy.BackendConnect"));

        foreach (var plc in opts.Plcs)
        {
            if (!plcContexts.TryGetValue(plc.Name, out var perPlcContext))
            {
                continue; // BCD map failed — skip this PLC.
            }

            // Each supervisor gets its own recovery pipeline (with its own logger category).
            var recoveryPipeline = PolicyFactory.BuildListenerRecovery(
                resilienceOpts.ListenerRecovery,
                _loggerFactory.CreateLogger($"Mbproxy.Proxy.ListenerRecovery.{plc.Name}"));

            var supervisor = new PlcListenerSupervisor(
                plc,
                opts.Connection,
                _pipeline,
                _loggerFactory.CreateLogger<PlcListener>(),
                _loggerFactory.CreateLogger<PlcMultiplexer>(),
                _loggerFactory.CreateLogger($"Mbproxy.Proxy.UpstreamPipe.{plc.Name}"),
                perPlcContext,
                recoveryPipeline,
                _loggerFactory.CreateLogger<PlcListenerSupervisor>(),
                backendPipeline);

            _supervisors[plc.Name] = supervisor;
        }
    }

    // ── Logging ───────────────────────────────────────────────────────────────────────────

    [LoggerMessage(EventId = 1, EventName = "mbproxy.startup.ready",
        Level = LogLevel.Information,
        Message = "mbproxy service ready — ListenersBound={ListenersBound} PlcsConfigured={PlcsConfigured}")]
    private static partial void LogStartupReady(ILogger logger, int listenersBound, int plcsConfigured);

    [LoggerMessage(EventId = 21, EventName = "mbproxy.startup.bind.failed",
        Level = LogLevel.Error,
        Message = "Failed to bind listener: Plc={Plc} Port={Port} Reason={Reason}")]
    private static partial void LogBindFailed(ILogger logger, string plc, int port, string reason);
}
|
||||
@@ -0,0 +1,56 @@
|
||||
namespace Mbproxy.Proxy;
|
||||
|
||||
/// <summary>
/// <see cref="LoggerMessage"/> source-generated log methods used by the BCD rewriter pipeline.
/// Event names are treated as stable identifiers — keep docs/design.md in step with any rename.
/// </summary>
internal static partial class RewriterLogEvents
{
    /// <summary>
    /// Warning: a 32-bit BCD pair is only partly inside the client's read/write range.
    /// The message records that the raw bytes are passed through without rewriting.
    /// </summary>
    [LoggerMessage(
        EventId = 30,
        EventName = "mbproxy.rewrite.partial_bcd",
        Level = LogLevel.Warning,
        Message = "Partial BCD overlap — passing through raw: Plc={PlcName} Address={Address} ClientStart={ClientStart} ClientQty={ClientQty}")]
    public static partial void PartialBcd(
        ILogger logger, string plcName, ushort address, ushort clientStart, ushort clientQty);

    /// <summary>
    /// Warning: the register at a configured BCD address holds a nibble >= 0xA, which is not
    /// a BCD digit. The message records that the raw bytes are passed through without rewriting.
    /// <paramref name="direction"/> is "Read" (response from PLC) or "Write" (request from client).
    /// </summary>
    [LoggerMessage(
        EventId = 31,
        EventName = "mbproxy.rewrite.invalid_bcd",
        Level = LogLevel.Warning,
        Message = "Invalid BCD nibble — passing through raw: Plc={PlcName} Address={Address} RawValue=0x{RawValue:X4} Direction={Direction}")]
    public static partial void InvalidBcd(
        ILogger logger, string plcName, ushort address, ushort rawValue, string direction);

    /// <summary>
    /// Information: the PLC answered with a Modbus exception response (high bit set on the
    /// function-code byte) and the frame is forwarded to the client as-is.
    /// </summary>
    [LoggerMessage(
        EventId = 32,
        EventName = "mbproxy.exception.passthrough",
        Level = LogLevel.Information,
        Message = "Modbus exception forwarded: Plc={PlcName} Fc=0x{Fc:X2} ExceptionCode={ExceptionCode}")]
    public static partial void ExceptionPassthrough(
        ILogger logger, string plcName, byte fc, byte exceptionCode);
}
|
||||
@@ -0,0 +1,404 @@
|
||||
using Mbproxy.Options;
|
||||
using Mbproxy.Proxy.Multiplexing;
|
||||
using Polly;
|
||||
|
||||
namespace Mbproxy.Proxy.Supervision;
|
||||
|
||||
/// <summary>
|
||||
/// Wraps one <see cref="PlcListener"/> in a Polly-backed recovery loop.
|
||||
///
|
||||
/// <para><b>State machine</b>:
|
||||
/// <list type="bullet">
|
||||
/// <item><description><b>Bound</b>: listener is accepting connections; <see cref="PlcListener.RunAsync"/> is awaiting.</description></item>
|
||||
/// <item><description><b>Recovering</b>: bind failed or RunAsync faulted; in Polly's delay window before the next attempt.</description></item>
|
||||
/// <item><description><b>Stopped</b>: terminal. <see cref="StopAsync"/> was called; no further retries.</description></item>
|
||||
/// </list>
|
||||
/// </para>
|
||||
///
|
||||
/// <para><b>RecoveryAttempts</b>: the counter accumulates over the lifetime of the
|
||||
/// supervisor. It is never reset after a successful re-bind so operators can see
|
||||
/// "this listener has flapped N times since the service started." See also
|
||||
/// <see cref="SupervisorSnapshot"/> doc comment.</para>
|
||||
///
|
||||
/// <para>The supervisor does NOT swallow exceptions from <see cref="PlcListener.RunAsync"/>
|
||||
/// except <see cref="OperationCanceledException"/>. Every other fault is logged at Warning
|
||||
/// with the exception message so operators can see WHY the listener was restarted.</para>
|
||||
/// </summary>
|
||||
internal sealed partial class PlcListenerSupervisor : IAsyncDisposable
|
||||
{
|
||||
// ── Construction-time dependencies (assigned once in the constructor) ────────────────
private readonly PlcOptions _plc;
private readonly ConnectionOptions _connectionOptions;
private readonly IPduPipeline _pipeline;
private readonly ILogger<PlcListener> _listenerLogger;
private readonly ILogger<PlcMultiplexer> _multiplexerLogger;
private readonly ILogger _pipeLogger;

// Context supplied at construction. This field is readonly; the swappable slot that is
// handed to each new PlcListener is _currentContext below.
private readonly PerPlcContext? _perPlcContext;

private readonly ResiliencePipeline _recoveryPipeline;
private readonly ILogger<PlcListenerSupervisor> _logger;
private readonly ResiliencePipeline? _backendConnectPipeline;

// ── Mutable state ────────────────────────────────────────────────────────────────────

// Volatile so Snapshot() can read state and error text without taking a lock.
private volatile SupervisorState _state = SupervisorState.Stopped;
private volatile string? _lastBindError;

// Only touched through Interlocked.
private int _recoveryAttempts;

// Phase 07: the listener created by the most recent recovery attempt, exposed so the
// status page can enumerate its upstream pipes. Null after a failed bind.
private volatile PlcListener? _currentListener;

// Phase 06: the live context slot. ReplaceContextAsync overwrites it; the supervisor
// loop reads it when constructing each PlcListener, so a swap is picked up by the next
// listener that gets created. Volatile so that read needs no lock.
private volatile PerPlcContext? _currentContext;

/// <summary>
/// Per-supervisor cancellation source. Its token is what the supervisor loop runs under,
/// so cancelling it ends both the Polly delay and the inner
/// <see cref="PlcListener.RunAsync"/> loop.
/// </summary>
private CancellationTokenSource _supervisorCts = new();

// The running supervisor loop; a completed task until StartAsync is called.
private Task _supervisorTask = Task.CompletedTask;

private bool _disposed;
|
||||
|
||||
// ── Public surface ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// <summary>Name of the PLC this supervisor serves, taken from its <see cref="PlcOptions"/>.</summary>
public string PlcName
{
    get { return _plc.Name; }
}

/// <summary>
/// Captures everything needed to build a <see cref="PlcListener"/> on each recovery attempt.
/// Nothing is started here; see <see cref="StartAsync"/>.
/// </summary>
/// <param name="perPlcContext">Initial per-PLC context; also seeds the swappable context slot.</param>
/// <param name="backendConnectPipeline">Optional pipeline forwarded to each listener.</param>
public PlcListenerSupervisor(
    PlcOptions plc,
    ConnectionOptions connectionOptions,
    IPduPipeline pipeline,
    ILogger<PlcListener> listenerLogger,
    ILogger<PlcMultiplexer> multiplexerLogger,
    ILogger pipeLogger,
    PerPlcContext? perPlcContext,
    ResiliencePipeline recoveryPipeline,
    ILogger<PlcListenerSupervisor> logger,
    ResiliencePipeline? backendConnectPipeline = null)
{
    // PLC identity and connection settings.
    _plc = plc;
    _connectionOptions = connectionOptions;
    _pipeline = pipeline;

    // Loggers: one for this supervisor, the rest are forwarded to the listener it builds.
    _logger = logger;
    _listenerLogger = listenerLogger;
    _multiplexerLogger = multiplexerLogger;
    _pipeLogger = pipeLogger;

    // Resilience pipelines.
    _recoveryPipeline = recoveryPipeline;
    _backendConnectPipeline = backendConnectPipeline;

    // Context: keep the original and seed the live (Phase 06) slot with the same instance.
    _perPlcContext = perPlcContext;
    _currentContext = perPlcContext;
}
|
||||
|
||||
/// <summary>
/// The <see cref="ProxyCounters"/> of the context currently held by this supervisor.
/// Read by <see cref="Configuration.ConfigReconciler"/> when it builds a replacement
/// context, so the same counter instance can be carried across a tag-map swap.
/// When there is no context (or it has no counters) a brand-new instance is returned on
/// every call — values written to that instance are not retained by the supervisor.
/// </summary>
public ProxyCounters CurrentCounters
{
    get
    {
        var counters = _currentContext?.Counters;
        return counters ?? new ProxyCounters();
    }
}
|
||||
|
||||
/// <summary>
/// The <see cref="UpstreamPipe"/> collection exposed by the current listener, or an empty
/// collection when no listener is set. Consumed by Phase 07's status page
/// (renamed from <c>ActivePairs</c> in Phase 9).
/// </summary>
public IReadOnlyCollection<UpstreamPipe> ActiveUpstreams
{
    get
    {
        // One read of the volatile field, so the null check and the access see the same listener.
        var listener = _currentListener;
        return listener?.ActiveUpstreams ?? Array.Empty<UpstreamPipe>();
    }
}
|
||||
|
||||
/// <summary>
/// Launches the supervisor loop on the thread pool and returns immediately; it does NOT
/// wait for the listener to reach <see cref="SupervisorState.Bound"/>.
/// The loop runs under a token linked to <paramref name="ct"/>, so cancelling
/// <paramref name="ct"/> also stops the supervisor.
///
/// <para>Call <see cref="WaitForInitialBindAttemptAsync"/> after this to block until the
/// supervisor has transitioned out of <see cref="SupervisorState.Stopped"/>.</para>
/// </summary>
/// <param name="ct">Parent token; the supervisor's own cancellation source is linked to it.</param>
/// <returns>An already-completed task.</returns>
public Task StartAsync(CancellationToken ct)
{
    var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);

    // Take the token now. The work item below may run later than this method returns;
    // reading the _supervisorCts field from inside the lambda would pick up whatever
    // source the field holds at that moment (or throw if it has been disposed).
    CancellationToken supervisorToken = cts.Token;

    _supervisorCts = cts;
    _supervisorTask = Task.Run(() => RunSupervisorAsync(supervisorToken), CancellationToken.None);
    return Task.CompletedTask;
}
|
||||
|
||||
/// <summary>
/// Polls every 10 ms until the supervisor has left <see cref="SupervisorState.Stopped"/>
/// (i.e. reached <see cref="SupervisorState.Bound"/> or
/// <see cref="SupervisorState.Recovering"/>), the supervisor task has completed, or
/// <paramref name="ct"/> is cancelled. Returns at once when any of those already holds.
/// </summary>
/// <exception cref="OperationCanceledException">
/// <paramref name="ct"/> was cancelled while waiting between polls.
/// </exception>
public async Task WaitForInitialBindAttemptAsync(CancellationToken ct)
{
    while (true)
    {
        // First bind attempt has produced a state.
        if (_state != SupervisorState.Stopped)
        {
            return;
        }

        // Caller gave up before a poll started.
        if (ct.IsCancellationRequested)
        {
            return;
        }

        // Loop never started or has already exited — nothing left to wait for.
        if (_supervisorTask.IsCompleted)
        {
            return;
        }

        await Task.Delay(10, ct).ConfigureAwait(false);
    }
}
|
||||
|
||||
/// <summary>
/// Marks the supervisor <see cref="SupervisorState.Stopped"/>, cancels its token — which
/// ends the Polly delay (if <see cref="SupervisorState.Recovering"/>) or the
/// <see cref="PlcListener.RunAsync"/> loop (if <see cref="SupervisorState.Bound"/>) — and
/// waits for the background task to finish or for <paramref name="ct"/> to fire.
///
/// <para>The state is written again after the wait: the supervisor loop's bind-failure
/// path sets <see cref="SupervisorState.Recovering"/>, and it can do so after the first
/// write here but before it observes the cancellation.</para>
/// </summary>
/// <param name="ct">Bounds how long this call waits for the supervisor task.</param>
public async Task StopAsync(CancellationToken ct)
{
    // Publish the terminal state right away so snapshots stop reporting Bound/Recovering.
    _state = SupervisorState.Stopped;

    await _supervisorCts.CancelAsync().ConfigureAwait(false);

    try
    {
        await _supervisorTask.WaitAsync(ct).ConfigureAwait(false);
    }
    catch (OperationCanceledException)
    {
        // ct fired before the task completed — supervisor task will terminate
        // asynchronously. Acceptable at shutdown.
    }
    catch (Exception)
    {
        // Supervisor task faulted — already logged inside RunSupervisorAsync.
    }

    // Re-assert: a bind failure racing with the cancel above may have overwritten the
    // state with Recovering. Stopped is terminal, so the last write must be ours.
    _state = SupervisorState.Stopped;
}
|
||||
|
||||
/// <summary>
/// Captures the supervisor's current state, last bind error and cumulative recovery-attempt
/// count in a <see cref="SupervisorSnapshot"/>. The three values are read one after
/// another without a lock.
/// </summary>
public SupervisorSnapshot Snapshot()
{
    SupervisorState state = _state;
    string? lastError = _lastBindError;

    // CompareExchange(x, 0, 0) leaves the value as it is (0 is only written when it is
    // already 0) and returns the current value — used here as an atomic read.
    int attempts = Interlocked.CompareExchange(ref _recoveryAttempts, 0, 0);

    return new SupervisorSnapshot(
        State: state,
        LastBindError: lastError,
        RecoveryAttempts: attempts);
}
|
||||
|
||||
/// <summary>
/// Swaps the per-PLC context (tag map) held by this supervisor without restarting the
/// listener.
///
/// <para><b>What the swap affects</b>: this method only overwrites the volatile
/// context slot. The supervisor loop reads that slot when it constructs a
/// <see cref="PlcListener"/>, so the next listener created (on any later recovery attempt)
/// receives <paramref name="newCtx"/>. The listener that is running at the time of the
/// call was constructed with the previous context.</para>
///
/// <para>NOTE(review): whether connections accepted by the already-running listener after
/// this call see the new map depends on how <see cref="PlcListener"/> uses the context it
/// was given — confirm against that class.</para>
///
/// <para>The method completes synchronously. <paramref name="ct"/> is not used; it is
/// present for API symmetry with start/stop.</para>
/// </summary>
/// <param name="newCtx">Context to hand to listeners created from now on.</param>
/// <param name="ct">Unused.</param>
/// <returns>An already-completed task.</returns>
public Task ReplaceContextAsync(PerPlcContext newCtx, CancellationToken ct)
{
    // Single volatile write; no other state is touched.
    _currentContext = newCtx;

    return Task.CompletedTask;
}
|
||||
|
||||
// ── Supervisor loop ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Supervisor loop. Inside <c>_recoveryPipeline</c> it repeatedly constructs a
/// <see cref="PlcListener"/>, binds it and runs its accept loop until
/// <paramref name="ct"/> is cancelled.
/// </summary>
/// <remarks>
/// Every failure path (bind failure, runtime fault, accept loop returning without
/// cancellation) retires the listener, records the fault and then throws so the
/// recovery pipeline can schedule the next attempt. Whatever the exit reason, the
/// state ends up as <see cref="SupervisorState.Stopped"/> and the published listener
/// reference is cleared.
/// </remarks>
/// <param name="ct">Token whose cancellation ends the loop.</param>
private async Task RunSupervisorAsync(CancellationToken ct)
{
    const int maxReasonLength = 256;

    // Flips to true after the first successful bind; later successful binds are
    // reported as recoveries instead of as the initial bind.
    bool hasBoundBefore = false;

    // Unpublishes the listener from the status page and disposes it so a fresh
    // instance can be created on the next attempt.
    async Task RetireAsync(PlcListener retiring)
    {
        _currentListener = null;
        await retiring.DisposeAsync().ConfigureAwait(false);
    }

    // Counts one fault, stores its message (capped at maxReasonLength characters),
    // switches to Recovering and mirrors the fault into the per-PLC counters when a
    // context is present. Returns the text that was stored.
    string NoteFault(string message)
    {
        Interlocked.Increment(ref _recoveryAttempts);

        string stored = message.Length > maxReasonLength
            ? message[..maxReasonLength]
            : message;

        _lastBindError = stored;
        _state = SupervisorState.Recovering;
        _currentContext?.Counters.IncrementRecoveryAttempt(stored);
        return stored;
    }

    try
    {
        // The pipeline wraps one complete construct → bind → run cycle. Any exception
        // leaving the callback is handed to the pipeline, which decides whether to
        // delay and run the callback again.
        await _recoveryPipeline.ExecuteAsync(async token =>
        {
            // A new listener per attempt. _currentContext is read here, so a context
            // swapped in by ReplaceContextAsync between attempts is picked up by the
            // listener built on this pass.
            var listener = new PlcListener(
                _plc,
                _connectionOptions,
                _pipeline,
                _listenerLogger,
                _multiplexerLogger,
                _pipeLogger,
                _currentContext,
                _backendConnectPipeline);

            // Published so the status page can enumerate the live listener.
            _currentListener = listener;

            // ── Bind ─────────────────────────────────────────────────────────
            try
            {
                // NOTE(review): the result of StartAsync is not awaited. If it returns
                // a task, a bind failure raised asynchronously would not reach the
                // catch below — confirm its signature.
                listener.StartAsync();
            }
            catch (Exception bindEx)
            {
                await RetireAsync(listener).ConfigureAwait(false);

                string reason = NoteFault(bindEx.Message);
                LogBindFailed(_logger, _plc.Name, _plc.ListenPort, reason);

                // Hand the failure to the recovery pipeline.
                throw;
            }

            // ── Bind succeeded ───────────────────────────────────────────────
            if (hasBoundBefore)
            {
                int attemptsSoFar = Interlocked.CompareExchange(ref _recoveryAttempts, 0, 0);
                LogListenerRecovered(_logger, _plc.Name, _plc.ListenPort, attemptsSoFar);
            }
            else
            {
                hasBoundBefore = true;
                LogBound(_logger, _plc.Name, _plc.ListenPort);
            }

            // A successful bind wipes the previously recorded error.
            _lastBindError = null;
            _currentContext?.Counters.ClearLastBindError();
            _state = SupervisorState.Bound;

            // ── Accept loop ──────────────────────────────────────────────────
            try
            {
                await listener.RunAsync(token).ConfigureAwait(false);
            }
            catch (OperationCanceledException)
            {
                // Shutdown path: retire the listener and let the cancellation escape
                // the pipeline without recording a fault.
                await RetireAsync(listener).ConfigureAwait(false);
                throw;
            }
            catch (Exception runEx)
            {
                // Logged before cleanup so the reason for the restart is visible even
                // if disposal itself fails.
                LogListenerFaulted(_logger, _plc.Name, _plc.ListenPort, runEx, runEx.Message);

                await RetireAsync(listener).ConfigureAwait(false);
                NoteFault(runEx.Message);

                // Hand the failure to the recovery pipeline.
                throw;
            }

            // RunAsync returned without throwing.
            await RetireAsync(listener).ConfigureAwait(false);

            // If that was due to cancellation, leave via OperationCanceledException.
            token.ThrowIfCancellationRequested();

            // Otherwise the accept loop stopped on its own: record it as a fault and
            // throw so the pipeline starts another cycle.
            const string unexpectedEnd = "Listener accept loop ended unexpectedly";
            NoteFault(unexpectedEnd);
            LogListenerEnded(_logger, _plc.Name, _plc.ListenPort);
            throw new InvalidOperationException(unexpectedEnd);
        }, ct).ConfigureAwait(false);
    }
    catch (OperationCanceledException)
    {
        // Expected when the supervisor token is cancelled.
    }
    catch (Exception ex)
    {
        // Reached only if the pipeline stops retrying or something unexpected escapes.
        _logger.LogError(ex, "Supervisor for Plc={Plc} exited unexpectedly: {Message}",
            _plc.Name, ex.Message);
    }
    finally
    {
        _state = SupervisorState.Stopped;
        _currentListener = null;
    }
}
|
||||
|
||||
// ── IAsyncDisposable ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Stops the supervisor (best effort) and disposes its cancellation source.
/// Calls after the first one return immediately.
/// </summary>
/// <remarks>
/// <c>StopAsync</c> is given a token that is cancelled after five seconds. Any
/// exception it throws — including a cancellation caused by that timeout — is logged
/// at Debug level and otherwise ignored so that disposal itself never throws.
/// </remarks>
/// <returns>A task that completes when cleanup has finished.</returns>
public async ValueTask DisposeAsync()
{
    if (_disposed)
    {
        return;
    }

    _disposed = true;

    using var stopCts = new CancellationTokenSource(TimeSpan.FromSeconds(5));
    try
    {
        await StopAsync(stopCts.Token).ConfigureAwait(false);
    }
    catch (Exception ex)
    {
        // Best-effort cleanup: disposal must not throw, but a failed or timed-out
        // stop should still leave a trace for diagnostics instead of vanishing.
        _logger.LogDebug(ex, "Supervisor for Plc={Plc} did not stop cleanly during dispose: {Message}",
            _plc.Name, ex.Message);
    }

    _supervisorCts.Dispose();
}
|
||||
|
||||
// ── Logging ───────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Source-generated log method for event 40 (<c>mbproxy.startup.bind</c>), written at
/// Information level. The supervisor calls it after the first successful bind.
/// </summary>
[LoggerMessage(
    EventId = 40,
    EventName = "mbproxy.startup.bind",
    Level = LogLevel.Information,
    Message = "Listener bound: Plc={Plc} Port={Port}")]
private static partial void LogBound(ILogger logger, string plc, int port);
|
||||
|
||||
/// <summary>
/// Source-generated log method for event 41 (<c>mbproxy.startup.bind.failed</c>),
/// written at Error level. <paramref name="reason"/> is the (already truncated)
/// exception message supplied by the caller.
/// </summary>
[LoggerMessage(
    EventId = 41,
    EventName = "mbproxy.startup.bind.failed",
    Level = LogLevel.Error,
    Message = "Failed to bind listener: Plc={Plc} Port={Port} Reason={Reason}")]
private static partial void LogBindFailed(ILogger logger, string plc, int port, string reason);
|
||||
|
||||
/// <summary>
/// Source-generated log method for event 42 (<c>mbproxy.listener.recovered</c>),
/// written at Information level. The supervisor calls it for every successful bind
/// after the first one, passing the running total of recovery attempts.
/// </summary>
[LoggerMessage(
    EventId = 42,
    EventName = "mbproxy.listener.recovered",
    Level = LogLevel.Information,
    Message = "Listener recovered: Plc={Plc} Port={Port} AttemptCount={AttemptCount}")]
private static partial void LogListenerRecovered(ILogger logger, string plc, int port, int attemptCount);
|
||||
|
||||
/// <summary>
/// Source-generated log method for event 43 (<c>mbproxy.listener.faulted</c>), written
/// at Warning level. The supervisor calls it when the accept loop throws something
/// other than a cancellation; <paramref name="ex"/> is attached to the log entry and
/// <paramref name="reason"/> fills the <c>{Reason}</c> placeholder.
/// </summary>
[LoggerMessage(
    EventId = 43,
    EventName = "mbproxy.listener.faulted",
    Level = LogLevel.Warning,
    Message = "Listener faulted (will recover): Plc={Plc} Port={Port} Reason={Reason}")]
private static partial void LogListenerFaulted(ILogger logger, string plc, int port, Exception ex, string reason);
|
||||
|
||||
/// <summary>
/// Source-generated log method for event 44 (<c>mbproxy.listener.ended</c>), written at
/// Warning level. The supervisor calls it when the accept loop returns without an
/// exception and without cancellation having been requested.
/// </summary>
[LoggerMessage(
    EventId = 44,
    EventName = "mbproxy.listener.ended",
    Level = LogLevel.Warning,
    Message = "Listener accept loop ended unexpectedly (will recover): Plc={Plc} Port={Port}")]
private static partial void LogListenerEnded(ILogger logger, string plc, int port);
|
||||
}
|
||||
@@ -0,0 +1,125 @@
|
||||
using System.Net.Sockets;
|
||||
using Mbproxy.Options;
|
||||
using Polly;
|
||||
using Polly.Retry;
|
||||
|
||||
namespace Mbproxy.Proxy.Supervision;
|
||||
|
||||
/// <summary>
/// Builds Polly v8 <see cref="ResiliencePipeline"/> instances from the typed resilience
/// configuration (<see cref="RetryProfile"/> and <see cref="RecoveryProfile"/>).
///
/// <para>Each method returns a pipeline configured from the profile it is given.
/// Callers are expected to build a pipeline once and reuse it; nothing in this class
/// caches the result.</para>
/// </summary>
internal static class PolicyFactory
{
    // ── Network errors that are retried on backend connect ──────────────────────────────
    // A SocketException carrying any other SocketError is not handled by the backend
    // connect pipeline and surfaces to the caller on the first occurrence.
    private static readonly HashSet<SocketError> RetryableSocketErrors =
    [
        SocketError.ConnectionRefused,
        SocketError.TimedOut,
        SocketError.HostUnreachable,
        SocketError.NetworkUnreachable,
    ];

    /// <summary>
    /// Builds a retry pipeline for backend (PLC) TCP connect attempts.
    ///
    /// <para>Only a <see cref="SocketException"/> whose
    /// <see cref="SocketException.SocketErrorCode"/> is in
    /// <see cref="RetryableSocketErrors"/> is retried. Every other exception
    /// (including <see cref="OperationCanceledException"/>) propagates immediately.</para>
    ///
    /// <para>Delays come from <see cref="RetryProfile.BackoffMs"/>: element [i] is the
    /// delay before retry i (0-based). Once the retry index runs past the end of the
    /// list the last element is reused; an empty list means no delay.</para>
    ///
    /// <para>When the profile allows a single attempt only
    /// (<see cref="RetryProfile.MaxAttempts"/> &lt;= 1) there is nothing to retry and
    /// <see cref="ResiliencePipeline.Empty"/> is returned. Polly rejects a retry
    /// strategy configured with zero retries, so building one here would throw.</para>
    ///
    /// <para>After the last attempt fails the exception is rethrown to the caller.</para>
    /// </summary>
    /// <param name="profile">Attempt count and backoff schedule.</param>
    /// <param name="logger">Receives a Debug entry for every retry.</param>
    /// <returns>The configured pipeline, or the empty pipeline when retries are disabled.</returns>
    public static ResiliencePipeline BuildBackendConnect(RetryProfile profile, ILogger logger)
    {
        // MaxAttempts counts the first attempt too, so retries = attempts - 1.
        int maxAttempts = Math.Max(1, profile.MaxAttempts);
        int maxRetries = maxAttempts - 1;

        if (maxRetries == 0)
        {
            // RetryStrategyOptions.MaxRetryAttempts must be at least 1; a pass-through
            // pipeline expresses "single attempt, no retry" without tripping validation.
            return ResiliencePipeline.Empty;
        }

        var backoffMs = profile.BackoffMs;

        return new ResiliencePipelineBuilder()
            .AddRetry(new RetryStrategyOptions
            {
                MaxRetryAttempts = maxRetries,
                ShouldHandle = new PredicateBuilder()
                    .Handle<SocketException>(ex => RetryableSocketErrors.Contains(ex.SocketErrorCode)),
                DelayGenerator = args =>
                {
                    int idx = args.AttemptNumber; // 0 = first retry, i.e. after attempt 0
                    // Clamp to the last element if we exceed the list.
                    int ms = backoffMs.Count > 0
                        ? backoffMs[Math.Min(idx, backoffMs.Count - 1)]
                        : 0;
                    return new ValueTask<TimeSpan?>(TimeSpan.FromMilliseconds(ms));
                },
                OnRetry = args =>
                {
                    logger.LogDebug(
                        "Backend connect retry {Attempt}/{Max}: {Error}",
                        args.AttemptNumber + 1,
                        maxRetries,
                        args.Outcome.Exception?.Message);
                    return ValueTask.CompletedTask;
                },
            })
            .Build();
    }

    /// <summary>
    /// Builds the retry pipeline used for listener bind recovery, configured with
    /// <see cref="int.MaxValue"/> retries so that it effectively never gives up.
    ///
    /// <para>The delay sequence is:
    /// <list type="bullet">
    /// <item><description>Retries 0 .. (InitialBackoffMs.Count - 1) use the matching element of <see cref="RecoveryProfile.InitialBackoffMs"/>.</description></item>
    /// <item><description>All later retries use <see cref="RecoveryProfile.SteadyStateMs"/>.</description></item>
    /// </list></para>
    ///
    /// <para>Every exception except <see cref="OperationCanceledException"/> is retried.
    /// A cancellation is not handled by the strategy and therefore escapes the
    /// pipeline, which is how the supervisor leaves its loop on shutdown.</para>
    /// </summary>
    /// <param name="profile">Initial backoff schedule and steady-state delay.</param>
    /// <param name="logger">Receives a Debug entry for every retry.</param>
    /// <returns>The configured pipeline.</returns>
    public static ResiliencePipeline BuildListenerRecovery(RecoveryProfile profile, ILogger logger)
    {
        var initialMs = profile.InitialBackoffMs;
        int steadyMs = profile.SteadyStateMs;

        return new ResiliencePipelineBuilder()
            .AddRetry(new RetryStrategyOptions
            {
                // Largest value the option accepts; in practice cancellation is the
                // only way out of the retry loop.
                MaxRetryAttempts = int.MaxValue,
                ShouldHandle = new PredicateBuilder().Handle<Exception>(
                    ex => ex is not OperationCanceledException),
                DelayGenerator = args =>
                {
                    // args.AttemptNumber is the zero-based index of the retry
                    // (0 = first retry, after the first failed attempt).
                    int idx = args.AttemptNumber;
                    int ms = idx < initialMs.Count
                        ? initialMs[idx]
                        : steadyMs;
                    return new ValueTask<TimeSpan?>(TimeSpan.FromMilliseconds(ms));
                },
                OnRetry = args =>
                {
                    logger.LogDebug(
                        "Listener recovery attempt {Attempt}: {Error}",
                        args.AttemptNumber + 1,
                        args.Outcome.Exception?.Message);
                    return ValueTask.CompletedTask;
                },
            })
            .Build();
    }
}
|
||||
@@ -0,0 +1,50 @@
|
||||
namespace Mbproxy.Proxy.Supervision;
|
||||
|
||||
/// <summary>
/// Lifecycle states reported by <see cref="PlcListenerSupervisor"/>.
/// </summary>
public enum SupervisorState
{
    /// <summary>
    /// The listener has been started successfully and its accept loop is (about to be)
    /// running. Set after <see cref="PlcListener.StartAsync"/> returns without throwing,
    /// both on the first bind and on any later re-bind.
    /// </summary>
    Bound = 0,

    /// <summary>
    /// No listener is bound; the supervisor has recorded a fault and handed control back
    /// to the recovery pipeline for another attempt. Set after a failed bind, after the
    /// accept loop throws, and after the accept loop ends without cancellation.
    /// </summary>
    Recovering = 1,

    /// <summary>
    /// Terminal state, set when the supervisor loop exits. The usual cause is
    /// <see cref="PlcListenerSupervisor.StopAsync"/> cancelling the loop; no further
    /// attempts are made after this point.
    /// </summary>
    Stopped = 2,
}
|
||||
|
||||
/// <summary>
/// Immutable point-in-time view of a supervisor, produced by
/// <see cref="PlcListenerSupervisor.Snapshot"/> for the status page.
///
/// <para><b>RecoveryAttempts semantics</b>: the supervisor increments this counter on
/// every failed bind, every accept-loop fault and every unexpected accept-loop exit.
/// Read it as "how many times has this listener faulted or failed to bind so far" —
/// a steadily climbing value points at port flapping or repeated network resets.</para>
/// </summary>
/// <param name="State">State of the supervisor at the time the snapshot was taken.</param>
/// <param name="LastBindError">
/// Message of the most recent fault, truncated to 256 characters. Besides bind failures
/// this also carries accept-loop fault messages. It is <c>null</c> when no fault has
/// been recorded since the last successful bind (the supervisor clears it each time a
/// bind succeeds).
/// </param>
/// <param name="RecoveryAttempts">
/// Running total of recorded faults (failed binds, runtime faults and unexpected
/// accept-loop exits). The supervisor loop only ever increments it.
/// </param>
public sealed record SupervisorSnapshot(SupervisorState State, string? LastBindError, int RecoveryAttempts);
|
||||
Reference in New Issue
Block a user