mbproxy: initial commit through Phase 9 (TxId multiplexing)

Adds the mbproxy service end-to-end. Phases 00-08 implement the
production-ready single-listener / 1:1-backend transparent Modbus TCP
proxy with bidirectional BCD rewriting for the ~54-PLC DL205/DL260
fleet. Phase 9 replaces the connection layer with a single backend
socket per PLC plus MBAP TxId rewriting, lifting the H2-ECOM100's
4-concurrent-client cap as an operational ceiling.

Phase 9 additions of note:
- PlcMultiplexer + UpstreamPipe + TxIdAllocator + CorrelationMap
- InFlightRequest with IReadOnlyList<InterestedParty> (load-bearing
  for Phase 10 read coalescing — do not collapse to a single field)
- Per-request watchdog: surfaces Modbus exception 0x0B to upstream
  on BackendRequestTimeoutMs, defending against lost responses,
  dead-PLC paths, and pymodbus 3.13.0's concurrent-multiplexed-
  request bug (its ServerRequestHandler.last_pdu state race)
- Status DTO + HTML gain inFlight / maxInFlight / txIdWraps /
  disconnectCascades / queueDepth (Tier 1.6 in docs/kpi.md)

Tests: 263 unit + 38 E2E. Multiplexer correctness under truly
concurrent backend traffic is proved against a stub backend in
PlcMultiplexerTests; MultiplexerE2ETests paces requests so pymodbus
3.13's single-PDU framer stays in known-good mode.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-14 01:49:35 -04:00
parent 2e937228a0
commit 56eee3c563
105 changed files with 18430 additions and 0 deletions
+336
View File
@@ -0,0 +1,336 @@
namespace Mbproxy.Proxy;
/// <summary>
/// Immutable snapshot of per-PLC counters. Consumed by Phase 07's status page.
/// All fields are point-in-time reads; no ordering guarantees across fields.
///
/// <para><b>Backwards-compat policy (see docs/kpi.md):</b> fields are <i>added</i>, never
/// renamed or removed. Phase 9 appended <c>InFlightCount</c>, <c>MaxInFlight</c>,
/// <c>TxIdWraps</c>, <c>BackendDisconnectCascades</c>, and <c>BackendQueueDepth</c> for
/// the TxId-multiplexer telemetry surface (Tier 1.6 in docs/kpi.md).</para>
///
/// <para>Positional parameters are documented with <c>&lt;param&gt;</c> tags (the form the
/// documentation tooling recognises for records) rather than inline comments in the
/// parameter list.</para>
/// </summary>
/// <param name="PdusForwarded">Cumulative count recorded via <c>ProxyCounters.IncrementPdusForwarded</c>.</param>
/// <param name="Fc03">Cumulative count of function code 0x03 recorded via <c>ProxyCounters.IncrementFcCount</c>.</param>
/// <param name="Fc04">Cumulative count of function code 0x04 recorded via <c>ProxyCounters.IncrementFcCount</c>.</param>
/// <param name="Fc06">Cumulative count of function code 0x06 recorded via <c>ProxyCounters.IncrementFcCount</c>.</param>
/// <param name="Fc16">Cumulative count of function code 0x10 (decimal 16) recorded via <c>ProxyCounters.IncrementFcCount</c>.</param>
/// <param name="FcOther">Cumulative count of every other function code passed to <c>ProxyCounters.IncrementFcCount</c>.</param>
/// <param name="RewrittenSlots">Running total of the values passed to <c>ProxyCounters.AddRewrittenSlots</c>.</param>
/// <param name="PartialBcdWarnings">Cumulative count recorded via <c>ProxyCounters.IncrementPartialBcd</c>.</param>
/// <param name="InvalidBcdWarnings">Cumulative count recorded via <c>ProxyCounters.IncrementInvalidBcd</c>.</param>
/// <param name="BackendException01">Cumulative count of backend Modbus exception code 1.</param>
/// <param name="BackendException02">Cumulative count of backend Modbus exception code 2.</param>
/// <param name="BackendException03">Cumulative count of backend Modbus exception code 3.</param>
/// <param name="BackendException04">Cumulative count of backend Modbus exception code 4.</param>
/// <param name="BackendExceptionOther">Cumulative count of backend Modbus exception codes other than 1 through 4.</param>
/// <param name="BytesUpstreamIn">Running total of the <c>up</c> values passed to <c>ProxyCounters.AddBytes</c>.</param>
/// <param name="BytesUpstreamOut">Running total of the <c>down</c> values passed to <c>ProxyCounters.AddBytes</c>.</param>
/// <param name="RecoveryAttempts">
/// Total number of failed listener bind attempts over the lifetime of the supervisor.
/// Accumulates; never resets. See <see cref="SupervisorSnapshot.RecoveryAttempts"/> doc.
/// </param>
/// <param name="LastBindError">
/// Most recent bind failure message (up to 256 chars); <c>null</c> if the listener
/// has never failed to bind or the error was cleared after a successful bind.
/// </param>
/// <param name="LastRoundTripMs">
/// EWMA of recent backend round-trip times in milliseconds (α = 0.2).
/// Zero when no successful round-trips have been observed yet.
/// Stored internally as a fixed-point <c>long</c> holding whole microseconds
/// (milliseconds × 1000) for Interlocked compatibility; divided by 1000 to produce
/// double milliseconds on snapshot.
/// </param>
/// <param name="ConnectsSuccess">
/// Number of backend connections successfully established (Polly final success).
/// </param>
/// <param name="ConnectsFailed">
/// Number of backend connections that failed on all Polly attempts.
/// </param>
/// <param name="InFlightCount">
/// Number of Modbus requests currently in flight on this PLC's multiplexed backend
/// connection (point-in-time snapshot of the correlation map size). Zero when no
/// multiplexer provider is registered. Phase 9.
/// </param>
/// <param name="MaxInFlight">
/// Peak <c>InFlightCount</c> observed since the multiplexer was constructed.
/// Updated via <see cref="Interlocked"/> CAS so concurrent in-flight increments do not
/// lose the high-water mark. Phase 9.
/// </param>
/// <param name="TxIdWraps">
/// Number of times the per-PLC TxId allocator's rolling cursor has wrapped
/// 0xFFFF → 0x0000. A non-zero value is benign; a sudden burst suggests extreme
/// in-flight churn. Zero when no multiplexer provider is registered. Phase 9.
/// </param>
/// <param name="BackendDisconnectCascades">
/// Cumulative count of upstream pipes closed as a side effect of a backend disconnect.
/// Each backend reconnect cycle adds the number of attached upstream clients at the
/// time of the disconnect. Phase 9.
/// </param>
/// <param name="BackendQueueDepth">
/// Current depth of the per-PLC outbound channel feeding the backend writer task
/// (frames queued, not yet on the wire). A sustained non-zero value indicates the
/// backend is slower than upstream demand. Zero when no multiplexer provider is
/// registered. Phase 9.
/// </param>
public sealed record CounterSnapshot(
long PdusForwarded,
long Fc03,
long Fc04,
long Fc06,
long Fc16,
long FcOther,
long RewrittenSlots,
long PartialBcdWarnings,
long InvalidBcdWarnings,
long BackendException01,
long BackendException02,
long BackendException03,
long BackendException04,
long BackendExceptionOther,
long BytesUpstreamIn,
long BytesUpstreamOut,
long RecoveryAttempts,
string? LastBindError,
double LastRoundTripMs,
long ConnectsSuccess,
long ConnectsFailed,
long InFlightCount,
long MaxInFlight,
long TxIdWraps,
long BackendDisconnectCascades,
long BackendQueueDepth);
/// <summary>
/// Thread-safe per-PLC counters backed by <see cref="System.Threading.Interlocked"/> longs.
/// The hot-path increment/add methods are allocation-free (no boxing, no heap traffic).
/// <see cref="IncrementRecoveryAttempt"/> may allocate a truncated copy of the message and
/// <see cref="Snapshot"/> allocates the record; both are off the hot path.
/// </summary>
internal sealed class ProxyCounters
{
    /// <summary>Maximum number of UTF-16 code units kept from a bind error message.</summary>
    private const int MaxBindErrorLength = 256;

    // ── Hot-path fields (Interlocked longs) ─────────────────────────────────
    private long _pdusForwarded;
    private long _fc03;
    private long _fc04;
    private long _fc06;
    private long _fc16;
    private long _fcOther;
    private long _rewrittenSlots;
    private long _partialBcdWarnings;
    private long _invalidBcdWarnings;
    private long _backendException01;
    private long _backendException02;
    private long _backendException03;
    private long _backendException04;
    private long _backendExceptionOther;
    private long _bytesUpstreamIn;
    private long _bytesUpstreamOut;
    private long _recoveryAttempts;
    private long _connectsSuccess;
    private long _connectsFailed;

    // Phase 9 multiplexer telemetry.
    private long _maxInFlight;
    private long _backendDisconnectCascades;

    // Phase 9: live state pulled from the multiplexer's allocator/map/queue on each
    // snapshot. The multiplexer registers a single provider via SetMultiplexProvider.
    // We use a volatile reference for lock-free read on the snapshot path.
    private volatile IMultiplexCountersProvider? _multiplexProvider;

    // Last bind failure message. A string cannot go through Interlocked arithmetic, so it
    // is published through a volatile reference instead. Written by
    // IncrementRecoveryAttempt / ClearLastBindError; read by Snapshot.
    private volatile string? _lastBindError;

    // EWMA round-trip: stored as a fixed-point long holding whole microseconds
    // (milliseconds * 1000) so we can use Interlocked.CompareExchange on it. The EWMA
    // smoothing factor α = 0.2 gives a half-life of ~3 samples (responds quickly to
    // changes without being noisy).
    // Updated by PlcMultiplexer on each successful response (request→response round-trip,
    // measured against InFlightRequest.SentAtUtc).
    // 0 = no samples observed yet.
    private long _lastRoundTripUsEwma; // fixed-point microseconds

    // ── Increment methods ────────────────────────────────────────────────────

    /// <summary>Adds one to the forwarded-PDU counter.</summary>
    public void IncrementPdusForwarded()
        => Interlocked.Increment(ref _pdusForwarded);

    /// <summary>
    /// Adds one to the counter for the given Modbus function code. Codes 0x03, 0x04,
    /// 0x06 and 0x10 have dedicated counters; every other code is counted under "Other".
    /// </summary>
    /// <param name="fc">Modbus function code byte.</param>
    public void IncrementFcCount(byte fc)
    {
        switch (fc)
        {
            case 0x03: Interlocked.Increment(ref _fc03); break;
            case 0x04: Interlocked.Increment(ref _fc04); break;
            case 0x06: Interlocked.Increment(ref _fc06); break;
            case 0x10: Interlocked.Increment(ref _fc16); break;
            default: Interlocked.Increment(ref _fcOther); break;
        }
    }

    /// <summary>Adds <paramref name="n"/> to the rewritten-slot total.</summary>
    public void AddRewrittenSlots(int n)
        => Interlocked.Add(ref _rewrittenSlots, n);

    /// <summary>Adds one to the partial-BCD warning counter.</summary>
    public void IncrementPartialBcd()
        => Interlocked.Increment(ref _partialBcdWarnings);

    /// <summary>Adds one to the invalid-BCD warning counter.</summary>
    public void IncrementInvalidBcd()
        => Interlocked.Increment(ref _invalidBcdWarnings);

    /// <summary>
    /// Increments the backend-exception counter for the given Modbus exception code.
    /// Codes 1 through 4 map to individual counters; anything else goes to "Other".
    /// </summary>
    /// <param name="code">Modbus exception code byte.</param>
    public void IncrementBackendException(byte code)
    {
        switch (code)
        {
            case 1: Interlocked.Increment(ref _backendException01); break;
            case 2: Interlocked.Increment(ref _backendException02); break;
            case 3: Interlocked.Increment(ref _backendException03); break;
            case 4: Interlocked.Increment(ref _backendException04); break;
            default: Interlocked.Increment(ref _backendExceptionOther); break;
        }
    }

    /// <summary>
    /// Adds byte counts for both upstream directions. Each addition is individually
    /// atomic, but the pair is not applied as one unit: a concurrent
    /// <see cref="Snapshot"/> may observe one total updated and not the other.
    /// </summary>
    /// <param name="up">Bytes to add to the upstream-in total.</param>
    /// <param name="down">Bytes to add to the upstream-out total.</param>
    public void AddBytes(long up, long down)
    {
        Interlocked.Add(ref _bytesUpstreamIn, up);
        Interlocked.Add(ref _bytesUpstreamOut, down);
    }

    /// <summary>
    /// Records one successful backend TCP connect (Polly pipeline returned success).
    /// </summary>
    public void IncrementConnectSuccess()
        => Interlocked.Increment(ref _connectsSuccess);

    /// <summary>
    /// Records one failed backend TCP connect (all Polly attempts exhausted).
    /// </summary>
    public void IncrementConnectFailed()
        => Interlocked.Increment(ref _connectsFailed);

    /// <summary>
    /// Records <paramref name="n"/> upstream pipes closed by a backend disconnect cascade.
    /// Phase 9.
    /// </summary>
    public void AddDisconnectCascades(int n)
        => Interlocked.Add(ref _backendDisconnectCascades, n);

    /// <summary>
    /// CAS-updates the peak in-flight high-water mark. Intended to be called on every
    /// successful allocation by the multiplexer. Phase 9.
    /// </summary>
    /// <param name="currentInFlight">In-flight count observed by the caller.</param>
    public void ObserveInFlight(int currentInFlight)
    {
        long sample = currentInFlight;
        long old;
        do
        {
            old = Interlocked.Read(ref _maxInFlight);

            // Not a new peak: nothing to publish.
            if (sample <= old)
            {
                return;
            }
        }
        while (Interlocked.CompareExchange(ref _maxInFlight, sample, old) != old);
    }

    /// <summary>
    /// Wires the live multiplexer telemetry source into this counter set. Called by
    /// <see cref="Mbproxy.Proxy.Multiplexing.PlcMultiplexer"/> at construction time so
    /// the status page's <see cref="Snapshot"/> can include live in-flight / queue-depth
    /// values without polling the multiplexer separately. Passing <c>null</c> detaches
    /// the provider, after which snapshots report 0 for the live values. Phase 9.
    /// </summary>
    internal void SetMultiplexProvider(IMultiplexCountersProvider? provider)
        => _multiplexProvider = provider;

    /// <summary>
    /// Increments the recovery-attempt counter and records the bind error message,
    /// truncated to at most 256 UTF-16 code units. Called by the supervisor on each
    /// failed bind. The counter update and the message store are two separate
    /// operations, not one atomic unit.
    /// </summary>
    /// <param name="errorMessage">
    /// Bind failure text. A <c>null</c> value is recorded as an empty string so the
    /// attempt is still counted without throwing.
    /// </param>
    public void IncrementRecoveryAttempt(string errorMessage)
    {
        Interlocked.Increment(ref _recoveryAttempts);
        _lastBindError = TruncateBindError(errorMessage ?? string.Empty);
    }

    /// <summary>
    /// Clears the last bind error after a successful bind.
    /// </summary>
    public void ClearLastBindError()
    {
        _lastBindError = null;
    }

    /// <summary>
    /// Updates the EWMA round-trip estimate with a new sample.
    /// Uses α = 0.2: new_ewma = 0.2 * sample + 0.8 * old_ewma.
    /// <paramref name="elapsedTicks"/> is from <see cref="System.Diagnostics.Stopwatch.GetTimestamp"/>.
    /// Thread-safe via CAS loop on a fixed-point microsecond long.
    /// Negative intervals are ignored so a bad measurement cannot drive the estimate
    /// below zero.
    /// </summary>
    /// <param name="elapsedTicks">
    /// Elapsed interval in <see cref="System.Diagnostics.Stopwatch"/> ticks.
    /// </param>
    public void UpdateRoundTripEwma(long elapsedTicks)
    {
        if (elapsedTicks < 0)
        {
            return;
        }

        // Convert Stopwatch ticks to milliseconds.
        double sampleMs = (double)elapsedTicks / System.Diagnostics.Stopwatch.Frequency * 1000.0;

        // Fixed-point: milliseconds * 1000 = whole microseconds, stored as a long for CAS.
        // This gives ~1 µs resolution, which is fine for Modbus round-trips (1 to 100 ms range).
        long sampleFixed = (long)(sampleMs * 1000.0);

        long old, newVal;
        do
        {
            old = Interlocked.Read(ref _lastRoundTripUsEwma);

            // 0 is the "no samples yet" sentinel: seed with the first sample;
            // otherwise blend the sample into the running estimate.
            newVal = old == 0
                ? sampleFixed
                : (long)(0.2 * sampleFixed + 0.8 * old);
        }
        while (Interlocked.CompareExchange(ref _lastRoundTripUsEwma, newVal, old) != old);
    }

    // ── Snapshot (off hot-path, may allocate) ────────────────────────────────

    /// <summary>
    /// Returns a point-in-time snapshot of all counters.
    /// Each long counter is read atomically via <see cref="Interlocked.Read"/>; the bind
    /// error is a volatile read; the live in-flight, TxId-wrap and queue-depth values come
    /// from the registered multiplexer provider, or 0 when none is registered.
    /// Fields are read one after another, so the snapshot is not a consistent cut
    /// across fields.
    /// May allocate (record construction); intended for the status-page path only.
    /// </summary>
    /// <returns>A new <see cref="CounterSnapshot"/> holding the values read.</returns>
    public CounterSnapshot Snapshot()
    {
        // Read the volatile reference once so all three live values come from the
        // same provider instance even if it is swapped concurrently.
        var provider = _multiplexProvider;
        long inFlightNow = provider?.InFlightCount ?? 0;
        long txWraps = provider?.TxIdWraps ?? 0;
        long queueDepth = provider?.BackendQueueDepth ?? 0;

        return new(
            PdusForwarded: Interlocked.Read(ref _pdusForwarded),
            Fc03: Interlocked.Read(ref _fc03),
            Fc04: Interlocked.Read(ref _fc04),
            Fc06: Interlocked.Read(ref _fc06),
            Fc16: Interlocked.Read(ref _fc16),
            FcOther: Interlocked.Read(ref _fcOther),
            RewrittenSlots: Interlocked.Read(ref _rewrittenSlots),
            PartialBcdWarnings: Interlocked.Read(ref _partialBcdWarnings),
            InvalidBcdWarnings: Interlocked.Read(ref _invalidBcdWarnings),
            BackendException01: Interlocked.Read(ref _backendException01),
            BackendException02: Interlocked.Read(ref _backendException02),
            BackendException03: Interlocked.Read(ref _backendException03),
            BackendException04: Interlocked.Read(ref _backendException04),
            BackendExceptionOther: Interlocked.Read(ref _backendExceptionOther),
            BytesUpstreamIn: Interlocked.Read(ref _bytesUpstreamIn),
            BytesUpstreamOut: Interlocked.Read(ref _bytesUpstreamOut),
            RecoveryAttempts: Interlocked.Read(ref _recoveryAttempts),
            LastBindError: _lastBindError,
            // Fixed-point microseconds → milliseconds.
            LastRoundTripMs: Interlocked.Read(ref _lastRoundTripUsEwma) / 1000.0,
            ConnectsSuccess: Interlocked.Read(ref _connectsSuccess),
            ConnectsFailed: Interlocked.Read(ref _connectsFailed),
            InFlightCount: inFlightNow,
            MaxInFlight: Interlocked.Read(ref _maxInFlight),
            TxIdWraps: txWraps,
            BackendDisconnectCascades: Interlocked.Read(ref _backendDisconnectCascades),
            BackendQueueDepth: queueDepth);
    }

    /// <summary>
    /// Caps <paramref name="message"/> at <see cref="MaxBindErrorLength"/> UTF-16 code
    /// units without cutting a surrogate pair in half.
    /// </summary>
    private static string TruncateBindError(string message)
    {
        if (message.Length <= MaxBindErrorLength)
        {
            return message;
        }

        // If the last kept code unit is a high surrogate, its low half would be cut
        // off; drop the high half too so the result never ends in a lone surrogate.
        int cut = char.IsHighSurrogate(message[MaxBindErrorLength - 1])
            ? MaxBindErrorLength - 1
            : MaxBindErrorLength;
        return message[..cut];
    }
}
/// <summary>
/// Read-only window into the per-PLC multiplexer's live state (allocator counts,
/// outbound-queue depth). Implemented by <see cref="Mbproxy.Proxy.Multiplexing.PlcMultiplexer"/>
/// and registered with <see cref="ProxyCounters.SetMultiplexProvider"/> so
/// <see cref="ProxyCounters.Snapshot"/> can include live mux telemetry without holding
/// a direct reference to the multiplexer (which would couple counter snapshots to the
/// connection layer's lifecycle). Phase 9.
///
/// <para><see cref="ProxyCounters.Snapshot"/> reads each property once per snapshot and
/// reports 0 for all three when no provider is registered.</para>
///
/// <para>NOTE(review): <see cref="ProxyCounters.Snapshot"/> calls these getters without
/// taking any lock, so implementations presumably need to be safe to read concurrently
/// with multiplexer activity — confirm against the implementing type.</para>
/// </summary>
internal interface IMultiplexCountersProvider
{
/// <summary>Number of currently-in-flight requests on the backend socket.</summary>
long InFlightCount { get; }
/// <summary>Cumulative 0xFFFF → 0x0000 wrap events from the TxId allocator.</summary>
long TxIdWraps { get; }
/// <summary>Current depth of the outbound channel (frames queued for the backend writer).</summary>
long BackendQueueDepth { get; }
}