mbproxy: initial commit through Phase 9 (TxId multiplexing)
Adds the mbproxy service end-to-end. Phases 00-08 implement the production-ready single-listener / 1:1-backend transparent Modbus TCP proxy with bidirectional BCD rewriting for the ~54-PLC DL205/DL260 fleet. Phase 9 replaces the connection layer with a single backend socket per PLC plus MBAP TxId rewriting, lifting the H2-ECOM100's 4-concurrent-client cap as an operational ceiling.

Phase 9 additions of note:
- PlcMultiplexer + UpstreamPipe + TxIdAllocator + CorrelationMap
- InFlightRequest with IReadOnlyList<InterestedParty> (load-bearing for Phase 10 read coalescing — do not collapse to a single field)
- Per-request watchdog: surfaces Modbus exception 0x0B to upstream on BackendRequestTimeoutMs, defending against lost responses, dead-PLC paths, and pymodbus 3.13.0's concurrent-multiplexed-request bug (its ServerRequestHandler.last_pdu state race)
- Status DTO + HTML gain inFlight / maxInFlight / txIdWraps / disconnectCascades / queueDepth (Tier 1.6 in docs/kpi.md)

Tests: 263 unit + 38 E2E. Multiplexer correctness under truly concurrent backend traffic is proved against a stub backend in PlcMultiplexerTests; MultiplexerE2ETests paces requests so pymodbus 3.13's single-PDU framer stays in known-good mode.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,336 @@
|
||||
namespace Mbproxy.Proxy;
|
||||
|
||||
/// <summary>
/// Immutable snapshot of per-PLC counters. Consumed by Phase 07's status page.
/// All fields are point-in-time reads; no ordering guarantees across fields.
///
/// <para><b>Backwards-compat policy (see docs/kpi.md):</b> fields are <i>added</i>, never
/// renamed or removed. Phase 9 appended <c>InFlightCount</c>, <c>MaxInFlight</c>,
/// <c>TxIdWraps</c>, <c>BackendDisconnectCascades</c>, and <c>BackendQueueDepth</c> for
/// the TxId-multiplexer telemetry surface (Tier 1.6 in docs/kpi.md).</para>
///
/// <para>Per-field documentation lives in <c>&lt;param&gt;</c> tags because a positional
/// record parameter is not a documentable element on its own: a <c>///</c> comment placed
/// directly on a parameter is dropped from the generated documentation.</para>
/// </summary>
/// <param name="PdusForwarded">Total number of PDUs forwarded.</param>
/// <param name="Fc03">PDUs counted under function code 0x03.</param>
/// <param name="Fc04">PDUs counted under function code 0x04.</param>
/// <param name="Fc06">PDUs counted under function code 0x06.</param>
/// <param name="Fc16">PDUs counted under function code 0x10 (decimal 16).</param>
/// <param name="FcOther">PDUs counted under any other function code.</param>
/// <param name="RewrittenSlots">Cumulative number of rewritten slots.</param>
/// <param name="PartialBcdWarnings">Cumulative number of partial-BCD warnings.</param>
/// <param name="InvalidBcdWarnings">Cumulative number of invalid-BCD warnings.</param>
/// <param name="BackendException01">Backend exceptions counted under exception code 1.</param>
/// <param name="BackendException02">Backend exceptions counted under exception code 2.</param>
/// <param name="BackendException03">Backend exceptions counted under exception code 3.</param>
/// <param name="BackendException04">Backend exceptions counted under exception code 4.</param>
/// <param name="BackendExceptionOther">Backend exceptions counted under any other exception code.</param>
/// <param name="BytesUpstreamIn">Cumulative upstream inbound byte count.</param>
/// <param name="BytesUpstreamOut">Cumulative upstream outbound byte count.</param>
/// <param name="RecoveryAttempts">
/// Total number of failed listener bind attempts over the lifetime of the supervisor.
/// Accumulates; never resets. See <see cref="SupervisorSnapshot.RecoveryAttempts"/> doc.
/// </param>
/// <param name="LastBindError">
/// Most recent bind failure message (up to 256 chars); <c>null</c> if the listener
/// has never failed to bind.
/// </param>
/// <param name="LastRoundTripMs">
/// EWMA of recent backend round-trip times in milliseconds (α = 0.2).
/// Zero when no successful round-trips have been observed yet.
/// Stored internally as whole microseconds in a <see cref="long"/> for Interlocked
/// compatibility; divided by 1000 to yield fractional milliseconds on snapshot.
/// </param>
/// <param name="ConnectsSuccess">
/// Number of backend connections successfully established (Polly final success).
/// </param>
/// <param name="ConnectsFailed">
/// Number of backend connections that failed on all Polly attempts.
/// </param>
/// <param name="InFlightCount">
/// Number of Modbus requests currently in flight on this PLC's multiplexed backend
/// connection (point-in-time snapshot of the correlation map size). Phase 9.
/// </param>
/// <param name="MaxInFlight">
/// Peak <see cref="InFlightCount"/> observed since the multiplexer was constructed.
/// Updated via <see cref="Interlocked"/> CAS so concurrent in-flight increments do not
/// lose the high-water mark. Phase 9.
/// </param>
/// <param name="TxIdWraps">
/// Number of times the per-PLC TxId allocator's rolling cursor has wrapped
/// 0xFFFF → 0x0000. A non-zero value is benign; a sudden burst suggests extreme
/// in-flight churn. Phase 9.
/// </param>
/// <param name="BackendDisconnectCascades">
/// Cumulative count of upstream pipes closed as a side effect of a backend disconnect.
/// Each backend reconnect cycle adds the number of attached upstream clients at the
/// time of the disconnect. Phase 9.
/// </param>
/// <param name="BackendQueueDepth">
/// Current depth of the per-PLC outbound channel feeding the backend writer task
/// (frames queued, not yet on the wire). A sustained non-zero value indicates the
/// backend is slower than upstream demand. Phase 9.
/// </param>
public sealed record CounterSnapshot(
    long PdusForwarded,
    long Fc03,
    long Fc04,
    long Fc06,
    long Fc16,
    long FcOther,
    long RewrittenSlots,
    long PartialBcdWarnings,
    long InvalidBcdWarnings,
    long BackendException01,
    long BackendException02,
    long BackendException03,
    long BackendException04,
    long BackendExceptionOther,
    long BytesUpstreamIn,
    long BytesUpstreamOut,
    long RecoveryAttempts,
    string? LastBindError,
    double LastRoundTripMs,
    long ConnectsSuccess,
    long ConnectsFailed,
    long InFlightCount,
    long MaxInFlight,
    long TxIdWraps,
    long BackendDisconnectCascades,
    long BackendQueueDepth);
|
||||
|
||||
/// <summary>
/// Thread-safe per-PLC counters backed by <see cref="System.Threading.Interlocked"/> longs.
/// All increment methods are allocation-free (no boxing, no heap traffic on the hot path),
/// except <see cref="IncrementRecoveryAttempt"/>, which may allocate when it truncates a
/// long error message.
/// <see cref="Snapshot"/> may allocate (record construction) — it is off-path (status page only).
/// </summary>
internal sealed class ProxyCounters
{
    /// <summary>Maximum number of UTF-16 code units of a bind error message that is retained.</summary>
    private const int MaxBindErrorLength = 256;

    /// <summary>EWMA smoothing factor α: new = α * sample + (1 - α) * old.</summary>
    private const double RoundTripEwmaAlpha = 0.2;

    /// <summary>Scale factor used to turn a seconds value into whole microseconds.</summary>
    private const double MicrosecondsPerSecond = 1_000_000.0;

    // ── Hot-path fields (Interlocked longs) ─────────────────────────────────

    private long _pdusForwarded;
    private long _fc03;
    private long _fc04;
    private long _fc06;
    private long _fc16;
    private long _fcOther;
    private long _rewrittenSlots;
    private long _partialBcdWarnings;
    private long _invalidBcdWarnings;
    private long _backendException01;
    private long _backendException02;
    private long _backendException03;
    private long _backendException04;
    private long _backendExceptionOther;
    private long _bytesUpstreamIn;
    private long _bytesUpstreamOut;
    private long _recoveryAttempts;
    private long _connectsSuccess;
    private long _connectsFailed;

    // Phase 9 multiplexer telemetry.
    private long _maxInFlight;
    private long _backendDisconnectCascades;

    // Phase 9: live state pulled from the multiplexer's allocator/map/queue on each
    // snapshot. The multiplexer registers a single provider via SetMultiplexProvider.
    // We use a volatile reference for lock-free read on the snapshot path.
    private volatile IMultiplexCountersProvider? _multiplexProvider;

    // Most recent bind error text. A string reference cannot go through the Interlocked
    // long helpers used for the numeric counters, so it is published through a volatile
    // field instead: written by IncrementRecoveryAttempt / ClearLastBindError, read by
    // Snapshot.
    private volatile string? _lastBindError;

    // EWMA of the backend round-trip time, stored as whole microseconds in a long so it
    // can be updated with Interlocked.CompareExchange. Snapshot divides by 1000 to report
    // fractional milliseconds. The smoothing factor α = 0.2 gives a half-life of
    // ~3 samples (responds quickly to changes without being noisy).
    // Updated by PlcMultiplexer on each successful response (request→response round-trip,
    // measured against InFlightRequest.SentAtUtc).
    // 0 = no samples observed yet.
    private long _lastRoundTripUsEwma;

    // ── Increment methods ────────────────────────────────────────────────────

    /// <summary>Counts one forwarded PDU.</summary>
    public void IncrementPdusForwarded()
        => Interlocked.Increment(ref _pdusForwarded);

    /// <summary>
    /// Counts one PDU under its function code. 0x03, 0x04, 0x06 and 0x10 have dedicated
    /// counters; every other code is accumulated in the "other" bucket.
    /// </summary>
    /// <param name="fc">Function code of the PDU being counted.</param>
    public void IncrementFcCount(byte fc)
    {
        switch (fc)
        {
            case 0x03: Interlocked.Increment(ref _fc03); break;
            case 0x04: Interlocked.Increment(ref _fc04); break;
            case 0x06: Interlocked.Increment(ref _fc06); break;
            case 0x10: Interlocked.Increment(ref _fc16); break;
            default: Interlocked.Increment(ref _fcOther); break;
        }
    }

    /// <summary>Adds <paramref name="n"/> to the rewritten-slot counter.</summary>
    /// <param name="n">Number of slots rewritten.</param>
    public void AddRewrittenSlots(int n)
        => Interlocked.Add(ref _rewrittenSlots, n);

    /// <summary>Counts one partial-BCD warning.</summary>
    public void IncrementPartialBcd()
        => Interlocked.Increment(ref _partialBcdWarnings);

    /// <summary>Counts one invalid-BCD warning.</summary>
    public void IncrementInvalidBcd()
        => Interlocked.Increment(ref _invalidBcdWarnings);

    /// <summary>
    /// Increments the backend-exception counter for the given Modbus exception code.
    /// Codes 1–4 map to individual counters; anything else goes to "Other".
    /// </summary>
    /// <param name="code">Exception code reported by the backend.</param>
    public void IncrementBackendException(byte code)
    {
        switch (code)
        {
            case 1: Interlocked.Increment(ref _backendException01); break;
            case 2: Interlocked.Increment(ref _backendException02); break;
            case 3: Interlocked.Increment(ref _backendException03); break;
            case 4: Interlocked.Increment(ref _backendException04); break;
            default: Interlocked.Increment(ref _backendExceptionOther); break;
        }
    }

    /// <summary>
    /// Adds byte counts for both upstream directions. Each direction is updated with its
    /// own atomic add; the pair is not updated as a single atomic unit, so a concurrent
    /// <see cref="Snapshot"/> may observe one direction updated and not yet the other.
    /// </summary>
    /// <param name="up">Bytes to add to the upstream-in total.</param>
    /// <param name="down">Bytes to add to the upstream-out total.</param>
    public void AddBytes(long up, long down)
    {
        Interlocked.Add(ref _bytesUpstreamIn, up);
        Interlocked.Add(ref _bytesUpstreamOut, down);
    }

    /// <summary>
    /// Records one successful backend TCP connect (Polly pipeline returned success).
    /// </summary>
    public void IncrementConnectSuccess()
        => Interlocked.Increment(ref _connectsSuccess);

    /// <summary>
    /// Records one failed backend TCP connect (all Polly attempts exhausted).
    /// </summary>
    public void IncrementConnectFailed()
        => Interlocked.Increment(ref _connectsFailed);

    /// <summary>
    /// Records <paramref name="n"/> upstream pipes closed by a backend disconnect cascade.
    /// Phase 9.
    /// </summary>
    /// <param name="n">Number of upstream pipes closed by this cascade.</param>
    public void AddDisconnectCascades(int n)
        => Interlocked.Add(ref _backendDisconnectCascades, n);

    /// <summary>
    /// CAS-updates the peak in-flight high-water mark. Called on every successful
    /// allocation by the multiplexer. Phase 9.
    /// </summary>
    /// <param name="currentInFlight">In-flight request count observed by the caller.</param>
    public void ObserveInFlight(int currentInFlight)
    {
        long sample = currentInFlight;
        long old;
        do
        {
            old = Interlocked.Read(ref _maxInFlight);

            // Not a new peak (or another thread already published a higher one).
            if (sample <= old)
            {
                return;
            }
        }
        while (Interlocked.CompareExchange(ref _maxInFlight, sample, old) != old);
    }

    /// <summary>
    /// Wires the live multiplexer telemetry source into this counter set. Called by
    /// <see cref="Mbproxy.Proxy.Multiplexing.PlcMultiplexer"/> at construction time so
    /// the status page's <see cref="Snapshot"/> can include live in-flight / queue-depth
    /// values without polling the multiplexer separately. Phase 9.
    /// </summary>
    /// <param name="provider">
    /// Live telemetry source, or <c>null</c> to detach; while detached, the live fields
    /// of <see cref="Snapshot"/> report zero.
    /// </param>
    internal void SetMultiplexProvider(IMultiplexCountersProvider? provider)
        => _multiplexProvider = provider;

    /// <summary>
    /// Increments the recovery-attempt counter and records the bind error message
    /// (truncated to at most 256 chars). Called by the supervisor on each failed bind.
    /// </summary>
    /// <param name="errorMessage">Bind failure message to retain.</param>
    public void IncrementRecoveryAttempt(string errorMessage)
    {
        Interlocked.Increment(ref _recoveryAttempts);
        _lastBindError = TruncateBindError(errorMessage);
    }

    /// <summary>
    /// Clears the last bind error after a successful bind.
    /// </summary>
    public void ClearLastBindError()
    {
        _lastBindError = null;
    }

    /// <summary>
    /// Updates the EWMA round-trip estimate with a new sample.
    /// Uses α = 0.2: new_ewma = 0.2 * sample + 0.8 * old_ewma.
    /// <paramref name="elapsedTicks"/> is from <see cref="System.Diagnostics.Stopwatch.GetTimestamp"/>.
    /// Thread-safe via CAS loop on a whole-microsecond long.
    /// </summary>
    /// <param name="elapsedTicks">
    /// Elapsed interval in <see cref="System.Diagnostics.Stopwatch"/> ticks. Negative
    /// values are ignored.
    /// </param>
    public void UpdateRoundTripEwma(long elapsedTicks)
    {
        // A negative interval is not a valid duration; folding it in would drag the
        // average below zero, so the sample is dropped.
        if (elapsedTicks < 0)
        {
            return;
        }

        // Convert Stopwatch ticks to whole microseconds. ~1 µs resolution is fine for
        // Modbus round-trips (1–100 ms range).
        double sampleUsExact =
            (double)elapsedTicks / System.Diagnostics.Stopwatch.Frequency * MicrosecondsPerSecond;
        long sampleUs = (long)System.Math.Round(sampleUsExact);

        long old, newVal;
        do
        {
            old = Interlocked.Read(ref _lastRoundTripUsEwma);

            // If no previous sample, seed with the first sample; otherwise apply the EWMA.
            // Round to nearest: truncating would bias the average downward on every update.
            newVal = old == 0
                ? sampleUs
                : (long)System.Math.Round(
                    RoundTripEwmaAlpha * sampleUs + (1.0 - RoundTripEwmaAlpha) * old);
        }
        while (Interlocked.CompareExchange(ref _lastRoundTripUsEwma, newVal, old) != old);
    }

    // ── Snapshot (off hot-path, may allocate) ────────────────────────────────

    /// <summary>
    /// Returns a point-in-time snapshot of all counters.
    /// Each numeric counter is read atomically via <see cref="Interlocked.Read"/>; the
    /// live multiplexer values come from the registered provider (zero when none is set).
    /// Fields are read one after another, so the snapshot is not consistent across fields.
    /// May allocate (record construction); intended for the status-page path only.
    /// </summary>
    /// <returns>A new <see cref="CounterSnapshot"/> holding the values read.</returns>
    public CounterSnapshot Snapshot()
    {
        // Copy the volatile reference once so all three live reads use the same provider.
        var provider = _multiplexProvider;
        long inFlightNow = provider?.InFlightCount ?? 0;
        long txWraps = provider?.TxIdWraps ?? 0;
        long queueDepth = provider?.BackendQueueDepth ?? 0;

        return new CounterSnapshot(
            PdusForwarded: Interlocked.Read(ref _pdusForwarded),
            Fc03: Interlocked.Read(ref _fc03),
            Fc04: Interlocked.Read(ref _fc04),
            Fc06: Interlocked.Read(ref _fc06),
            Fc16: Interlocked.Read(ref _fc16),
            FcOther: Interlocked.Read(ref _fcOther),
            RewrittenSlots: Interlocked.Read(ref _rewrittenSlots),
            PartialBcdWarnings: Interlocked.Read(ref _partialBcdWarnings),
            InvalidBcdWarnings: Interlocked.Read(ref _invalidBcdWarnings),
            BackendException01: Interlocked.Read(ref _backendException01),
            BackendException02: Interlocked.Read(ref _backendException02),
            BackendException03: Interlocked.Read(ref _backendException03),
            BackendException04: Interlocked.Read(ref _backendException04),
            BackendExceptionOther: Interlocked.Read(ref _backendExceptionOther),
            BytesUpstreamIn: Interlocked.Read(ref _bytesUpstreamIn),
            BytesUpstreamOut: Interlocked.Read(ref _bytesUpstreamOut),
            RecoveryAttempts: Interlocked.Read(ref _recoveryAttempts),
            LastBindError: _lastBindError,
            // Stored as microseconds; reported as fractional milliseconds.
            LastRoundTripMs: Interlocked.Read(ref _lastRoundTripUsEwma) / 1000.0,
            ConnectsSuccess: Interlocked.Read(ref _connectsSuccess),
            ConnectsFailed: Interlocked.Read(ref _connectsFailed),
            InFlightCount: inFlightNow,
            MaxInFlight: Interlocked.Read(ref _maxInFlight),
            TxIdWraps: txWraps,
            BackendDisconnectCascades: Interlocked.Read(ref _backendDisconnectCascades),
            BackendQueueDepth: queueDepth);
    }

    /// <summary>
    /// Limits a bind error message to <see cref="MaxBindErrorLength"/> UTF-16 code units
    /// without cutting a surrogate pair in half.
    /// </summary>
    private static string TruncateBindError(string message)
    {
        if (message.Length <= MaxBindErrorLength)
        {
            return message;
        }

        // If the last retained code unit is a high surrogate, its low half would be cut
        // off and leave an unpaired surrogate; drop the whole pair instead.
        int cut = char.IsHighSurrogate(message[MaxBindErrorLength - 1])
            ? MaxBindErrorLength - 1
            : MaxBindErrorLength;
        return message[..cut];
    }
}
|
||||
|
||||
/// <summary>
/// Live, read-only view of one PLC multiplexer's state: allocator counts and the depth
/// of its outbound queue. <see cref="Mbproxy.Proxy.Multiplexing.PlcMultiplexer"/>
/// implements it and hands it to <see cref="ProxyCounters.SetMultiplexProvider"/>, which
/// lets <see cref="ProxyCounters.Snapshot"/> report mux telemetry without referencing the
/// multiplexer directly (a direct reference would tie counter snapshots to the lifecycle
/// of the connection layer). Phase 9.
/// </summary>
internal interface IMultiplexCountersProvider
{
    /// <summary>
    /// Requests currently in flight on the backend socket.
    /// </summary>
    long InFlightCount { get; }

    /// <summary>
    /// Running total of TxId allocator wrap events (0xFFFF → 0x0000).
    /// </summary>
    long TxIdWraps { get; }

    /// <summary>
    /// Frames currently queued in the outbound channel for the backend writer.
    /// </summary>
    long BackendQueueDepth { get; }
}
|
||||
Reference in New Issue
Block a user