mbproxy: Wave 2 fixes from 2026-05-14 code review
Resolves the 21 Major findings catalogued in
codereviews/2026-05-14/RemediationPlan.md (Wave 2). Tests: 370 pass / 0 fail
(baseline 363 + 7 new W2 regression tests).
Multiplexer / concurrency:
W2.1 ConfigReconciler.Attach now threads the live coalescingAccessor through
to add/restart-built supervisors so a hot-reload of
ReadCoalescing.{Enabled,MaxParties} propagates to PLCs added or
restarted via reload.
W2.2 PlcMultiplexer._disposed and UpstreamPipe._disposed are now volatile
for ARM/portability defense.
W2.3 ProxyWorker._supervisors / ConfigReconciler._supervisors switched from
Dictionary to ConcurrentDictionary; reconciler uses TryRemove. The
outer Apply is serialised by a semaphore but the inner Add/Remove/
Restart Task.WhenAll continuations run in parallel.
W2.4 Counter parity for cache miss + coalescing-saturation miss documented
inline (per-design contract; behavior unchanged).
W2.5 _disposeCts.Dispose() and _connectGate.Dispose() guarded against late
watchdog ticks.
W2.6 _connectGate disposed in DisposeAsync.
W2.7 Inline doc clarifying the post-rewriter FC byte read.
Cache / hot-reload:
W2.8 PlcListenerSupervisor.ReplaceContextAsync now calls Clear() to capture
the entry count, emits mbproxy.cache.flushed, then disposes the old
cache. Previously the event was defined but never emitted.
W2.9 Inline doc explaining the implicit "skip cache invalidation while
recovering" gating (no backend reader during recovery → no FC06/FC16
response → no invalidation).
W2.10 ReloadValidator now re-checks resolved per-tag CacheTtlMs against
Cache.AllowLongTtl after BcdTagMapBuilder folds the per-PLC default.
BCD rewriter:
W2.11 Duplicate addresses detected within Global itself and within the per-PLC
Add list itself, BEFORE the working dictionary collapses keys. Cross-list
collisions (Global vs Add) remain the documented width-override pattern.
Previously the DuplicateAddress error was unreachable dead code.
W2.12 OverlappingHighRegister reports each colliding pair exactly once
(canonicalised low/high pair tracked in a HashSet).
W2.13 FC16 32-bit write rejects clientLow > 9999 or clientHigh > 9999 BEFORE
the high*10000+low reconstruction. Without this guard, (high=9999,
low=9999) silently re-encoded as (high=9998, low=9999), losing 1 from
the high word.
W2.14 FC16 validates pdu.Length >= 6 + qty*2 upfront — no half-rewritten
requests when a malformed client claims more registers than it ships.
Supervisor:
W2.15 WaitForInitialBindAttemptAsync is now backed by a TaskCompletionSource
instead of a 10 ms busy-poll. This fixes both the race against fast
Stopped→Bound→Stopped transitions and the hang when the supervisor task throws.
W2.16 StartAsync refuses re-entry on a non-Stopped supervisor (was leaking
the previous _supervisorCts).
W2.17 New TransitionTo helper writes _state, _lastBindError, and (optionally)
_recoveryAttempts under one lock. Snapshot() reads under the same lock
so the status page never reports an inconsistent triple. Truncate
helper extracted (was copy-pasted across three sites).
W2.18 MbproxyOptionsValidator + ReloadValidator reject
Connection.{BackendConnectTimeoutMs, BackendRequestTimeoutMs,
GracefulShutdownTimeoutMs} <= 0. A misconfigured 0 produces immediate
CancelAfter(0) failures.
Hosting / diagnostics:
W2.20 ProxyWorker.StopAsync supervisor-stop deadline now reads from
IOptionsMonitor.CurrentValue.Connection.GracefulShutdownTimeoutMs
(was hard-coded 5s).
W2.21 src/Mbproxy/appsettings.json deleted; the published file is now a Link
to install/mbproxy.config.template.json so the binary ships with a
usable, fully-commented example config instead of an empty stub. Tests
strip the inherited file from their bin via an AfterTargets="Build"
Target so they don't pick up the template's example PLCs.
W2.22 invalidBcdWarnings (PlcPdusStatus) and codeOther (ExceptionCounts)
added to StatusDto, plumbed through StatusSnapshotBuilder, surfaced
in StatusHtmlRenderer table cells.
W2.23 EventLogBridge caches EventLog.SourceExists at construction so Emit
doesn't hit the registry on every Error+ log line.
New regression tests:
ReloadValidatorTests:
Validate_PerTagCacheTtl_Above60s_Without_AllowLongTtl_Fails
Validate_PerTagCacheTtl_Above60s_With_AllowLongTtl_Passes
Validate_ResolvedTtl_FromPerPlcDefault_AboveCap_Fails
Validate_ZeroBackendConnectTimeoutMs_Fails
Validate_NegativeGracefulShutdownTimeoutMs_Fails
BcdPduPipelineTests:
FC16_32Bit_ClientHighOrLowAbove9999_PassesThroughRaw_WithInvalidBcdWarning
FC16_TruncatedRegisterData_PassesThroughRaw_NoPartialRewrite
Reworked tests in BcdTagMapBuilderTests for the W2.11 contract (Global dup,
Add dup, Add-overrides-Global accepted as width override).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
using Mbproxy.Options;
|
||||
using Mbproxy.Proxy.Cache;
|
||||
using Mbproxy.Proxy.Multiplexing;
|
||||
using Polly;
|
||||
|
||||
@@ -66,6 +67,13 @@ internal sealed partial class PlcListenerSupervisor : IAsyncDisposable
|
||||
|
||||
private bool _disposed;
|
||||
|
||||
// Phase 12 (W2.15) — completes when the supervisor has transitioned out of Stopped
|
||||
// for the first time (reached Bound or Recovering), or when the supervisor task exits
// without ever getting there. Replaces the previous busy-poll
|
||||
// implementation in WaitForInitialBindAttemptAsync, which raced fast Stopped→Bound→
|
||||
// Stopped transitions and never exited if the supervisor task threw inside Polly.
|
||||
private readonly TaskCompletionSource _firstAttemptCompleted = new(
|
||||
TaskCreationOptions.RunContinuationsAsynchronously);
|
||||
|
||||
// ── Public surface ────────────────────────────────────────────────────────────────────
|
||||
|
||||
public string PlcName => _plc.Name;
|
||||
@@ -123,6 +131,16 @@ internal sealed partial class PlcListenerSupervisor : IAsyncDisposable
|
||||
/// </summary>
|
||||
public Task StartAsync(CancellationToken ct)
|
||||
{
|
||||
// Phase 12 (W2.16) — refuse to re-Start an already-running or already-disposed
|
||||
// supervisor. The original code reassigned _supervisorCts unconditionally, which
|
||||
// leaked the previous CTS and could leave a zombie task running against an
|
||||
// unobserved token. The supervisor's state machine has exactly one Start.
|
||||
if (_disposed)
|
||||
throw new ObjectDisposedException(nameof(PlcListenerSupervisor));
|
||||
if (_state != SupervisorState.Stopped || !_supervisorTask.IsCompleted)
|
||||
throw new InvalidOperationException(
|
||||
$"Supervisor for Plc='{_plc.Name}' has already been started.");
|
||||
|
||||
_supervisorCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
_supervisorTask = Task.Run(() => RunSupervisorAsync(_supervisorCts.Token), CancellationToken.None);
|
||||
return Task.CompletedTask;
|
||||
@@ -133,13 +151,22 @@ internal sealed partial class PlcListenerSupervisor : IAsyncDisposable
|
||||
/// (transitioned to <see cref="SupervisorState.Bound"/> or
|
||||
/// <see cref="SupervisorState.Recovering"/>).
|
||||
/// Returns immediately if the supervisor is already past that point.
|
||||
///
|
||||
/// <para><b>Phase 12 (W2.15)</b> — backed by a <see cref="TaskCompletionSource"/> set
|
||||
/// when the supervisor task first transitions out of <see cref="SupervisorState.Stopped"/>.
|
||||
/// Replaces the previous 10 ms busy-poll which raced fast bind+stop sequences and could
|
||||
/// hang if the supervisor task threw before any state write happened.</para>
|
||||
/// </summary>
|
||||
public async Task WaitForInitialBindAttemptAsync(CancellationToken ct)
{
    // Fast path: the first transition out of Stopped (or the supervisor's exit) has
    // already been signalled, so there is nothing to wait for.
    Task firstAttempt = _firstAttemptCompleted.Task;
    if (firstAttempt.IsCompleted)
    {
        return;
    }

    try
    {
        // Wait on the completion source instead of polling _state. WaitAsync(ct) only
        // abandons THIS wait when ct fires; the shared completion source is untouched,
        // so other waiters are not affected by one caller giving up.
        await firstAttempt.WaitAsync(ct).ConfigureAwait(false);
    }
    catch (OperationCanceledException)
    {
        // Caller cancelled; not a fault — same observable behaviour as the prior poll.
    }
}
|
||||
|
||||
@@ -173,11 +200,43 @@ internal sealed partial class PlcListenerSupervisor : IAsyncDisposable
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Returns a point-in-time snapshot of this supervisor's state.
///
/// <para><b>Phase 12 (W2.17)</b> — reads the three observable fields under a single
/// lock (<c>_snapshotLock</c>) so the status page can never report inconsistent triples
/// like <c>(State=Bound, LastBindError=&lt;previous&gt;, RecoveryAttempts&gt;0)</c>. The
/// supervisor task uses <see cref="TransitionTo"/> which takes the same lock, so a
/// snapshot reads a transition-consistent view.</para>
/// </summary>
/// <returns>
/// A <see cref="SupervisorSnapshot"/> built from the state, last bind error and
/// recovery-attempt count as they were while the lock was held.
/// </returns>
public SupervisorSnapshot Snapshot()
{
    // This is the only Snapshot() definition: the earlier lock-free, expression-bodied
    // variant read the fields individually and could interleave with a transition.
    lock (_snapshotLock)
    {
        return new SupervisorSnapshot(
            State: _state,
            LastBindError: _lastBindError,
            RecoveryAttempts: _recoveryAttempts);
    }
}
|
||||
|
||||
private readonly object _snapshotLock = new();
|
||||
|
||||
/// <summary>
/// Phase 12 (W2.17) — applies a state transition as one unit. The new state, the
/// last-bind-error text and the optional recovery-attempt bump are all written while
/// holding <c>_snapshotLock</c>, so a concurrent <see cref="Snapshot"/> (which takes the
/// same lock) never observes a half-applied transition.
/// </summary>
/// <param name="newState">State to record.</param>
/// <param name="lastBindError">Error text to record; <c>null</c> clears it.</param>
/// <param name="incrementRecoveryAttempt">
/// When <c>true</c>, the recovery-attempt counter is incremented by one as part of the
/// same locked update.
/// </param>
private void TransitionTo(SupervisorState newState, string? lastBindError, bool incrementRecoveryAttempt)
{
    lock (_snapshotLock)
    {
        _state = newState;
        _lastBindError = lastBindError;

        if (!incrementRecoveryAttempt)
        {
            return;
        }

        _recoveryAttempts += 1;
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Atomically swaps the per-PLC context (tag map + optional response cache) on the
|
||||
@@ -210,12 +269,16 @@ internal sealed partial class PlcListenerSupervisor : IAsyncDisposable
|
||||
// inside the Polly loop will pick up newCtx through _currentContext above.
|
||||
_currentListener?.Multiplexer?.ReplaceContext(newCtx);
|
||||
|
||||
// Phase 12 (W1.1 + W2.8 prereq) — drop the outgoing cache AFTER the swap so the
|
||||
// running multiplexer can no longer reach it. Dispose stops the eviction loop and
|
||||
// releases the timer. (The cache.flushed log event is W2.8 work; this Wave-1 fix
|
||||
// is the "no longer in use, safe to drop" piece.)
|
||||
// Phase 12 (W1.1 + W2.8) — drop the outgoing cache AFTER the swap so the running
|
||||
// multiplexer can no longer reach it. Clear() snapshots the entry count for the
|
||||
// mbproxy.cache.flushed log event before disposing the cache (which stops the
|
||||
// eviction loop and releases the timer).
|
||||
if (oldCache is not null && !ReferenceEquals(oldCache, newCtx.Cache))
|
||||
{
|
||||
int dropped = oldCache.Clear();
|
||||
CacheLogEvents.Flushed(_logger, _plc.Name, "tag-list-reload", dropped);
|
||||
oldCache.Dispose();
|
||||
}
|
||||
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
@@ -268,11 +331,10 @@ internal sealed partial class PlcListenerSupervisor : IAsyncDisposable
|
||||
_currentListener = null;
|
||||
await listener.DisposeAsync().ConfigureAwait(false);
|
||||
|
||||
Interlocked.Increment(ref _recoveryAttempts);
|
||||
string reason = bindEx.Message;
|
||||
string truncated = reason.Length > 256 ? reason[..256] : reason;
|
||||
_lastBindError = truncated;
|
||||
_state = SupervisorState.Recovering;
|
||||
string truncated = Truncate(bindEx.Message, 256);
|
||||
TransitionTo(SupervisorState.Recovering, truncated, incrementRecoveryAttempt: true);
|
||||
// Phase 12 (W2.15) — signal the first transition out of Stopped.
|
||||
_firstAttemptCompleted.TrySetResult();
|
||||
|
||||
// Also update the per-PLC counters if available (Phase 07 reads these).
|
||||
_currentContext?.Counters.IncrementRecoveryAttempt(truncated);
|
||||
@@ -297,9 +359,10 @@ internal sealed partial class PlcListenerSupervisor : IAsyncDisposable
|
||||
}
|
||||
|
||||
// Clear the last bind error on a successful bind.
|
||||
_lastBindError = null;
|
||||
TransitionTo(SupervisorState.Bound, lastBindError: null, incrementRecoveryAttempt: false);
|
||||
_currentContext?.Counters.ClearLastBindError();
|
||||
_state = SupervisorState.Bound;
|
||||
// Phase 12 (W2.15) — signal the first transition out of Stopped.
|
||||
_firstAttemptCompleted.TrySetResult();
|
||||
|
||||
// ── Run the accept loop ──────────────────────────────────────────
|
||||
// RunAsync returns when: (a) token is cancelled (normal shutdown),
|
||||
@@ -324,10 +387,12 @@ internal sealed partial class PlcListenerSupervisor : IAsyncDisposable
|
||||
_currentListener = null;
|
||||
await listener.DisposeAsync().ConfigureAwait(false);
|
||||
|
||||
Interlocked.Increment(ref _recoveryAttempts);
|
||||
string truncated = runEx.Message.Length > 256 ? runEx.Message[..256] : runEx.Message;
|
||||
_lastBindError = truncated;
|
||||
_state = SupervisorState.Recovering;
|
||||
string truncated = Truncate(runEx.Message, 256);
|
||||
TransitionTo(SupervisorState.Recovering, truncated, incrementRecoveryAttempt: true);
|
||||
// Phase 12 (W2.15) — also signal first-attempt-completed in case the
|
||||
// very first listener.RunAsync faulted before the bind-success path
|
||||
// signalled it.
|
||||
_firstAttemptCompleted.TrySetResult();
|
||||
|
||||
// Also update the per-PLC counters if available.
|
||||
_currentContext?.Counters.IncrementRecoveryAttempt(truncated);
|
||||
@@ -346,10 +411,8 @@ internal sealed partial class PlcListenerSupervisor : IAsyncDisposable
|
||||
|
||||
// Otherwise (listener closed without cancellation — e.g., OS event),
|
||||
// treat as a fault and re-enter recovery.
|
||||
Interlocked.Increment(ref _recoveryAttempts);
|
||||
const string unexpectedEnd = "Listener accept loop ended unexpectedly";
|
||||
_lastBindError = unexpectedEnd;
|
||||
_state = SupervisorState.Recovering;
|
||||
TransitionTo(SupervisorState.Recovering, unexpectedEnd, incrementRecoveryAttempt: true);
|
||||
_currentContext?.Counters.IncrementRecoveryAttempt(unexpectedEnd);
|
||||
LogListenerEnded(_logger, _plc.Name, _plc.ListenPort);
|
||||
throw new InvalidOperationException(unexpectedEnd);
|
||||
@@ -369,11 +432,26 @@ internal sealed partial class PlcListenerSupervisor : IAsyncDisposable
|
||||
}
|
||||
finally
|
||||
{
|
||||
_state = SupervisorState.Stopped;
|
||||
// Snapshot consistency: state goes back to Stopped without changing the last
|
||||
// bind error so operators can still see WHY the supervisor exited.
|
||||
lock (_snapshotLock)
|
||||
{
|
||||
_state = SupervisorState.Stopped;
|
||||
}
|
||||
_currentListener = null;
|
||||
// Phase 12 (W2.15) — defensive: if RunSupervisorAsync exits before any bind
|
||||
// attempt fired (e.g. construction-time fault), unblock any awaiting
|
||||
// WaitForInitialBindAttemptAsync caller so it doesn't hang.
|
||||
_firstAttemptCompleted.TrySetResult();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Phase 12 (W2 cleanup) — single helper for the truncate-exception-message pattern
/// previously copy-pasted across three call sites. Caps <paramref name="s"/> at
/// <paramref name="max"/> UTF-16 code units without splitting a surrogate pair.
/// </summary>
/// <param name="s">Text to cap; returned unchanged when it already fits.</param>
/// <param name="max">Maximum number of <see cref="char"/> values to keep.</param>
/// <returns>
/// <paramref name="s"/> itself when its length is at most <paramref name="max"/>;
/// otherwise its leading <paramref name="max"/> characters, or one fewer when the cut
/// would otherwise end on a high surrogate.
/// </returns>
private static string Truncate(string s, int max)
{
    if (s.Length <= max)
    {
        return s;
    }

    // A plain s[..max] can cut between the two halves of a surrogate pair and leave a
    // lone high surrogate (invalid UTF-16) at the end of the text. Drop that orphan too.
    int cut = max > 0 && char.IsHighSurrogate(s[max - 1]) ? max - 1 : max;
    return s[..cut];
}
|
||||
|
||||
// ── IAsyncDisposable ─────────────────────────────────────────────────────────────────
|
||||
|
||||
public async ValueTask DisposeAsync()
|
||||
|
||||
Reference in New Issue
Block a user