diff --git a/mbproxy/codereviews/2026-05-14/ReReviewAfterRemediation.md b/mbproxy/codereviews/2026-05-14/ReReviewAfterRemediation.md index 2f12f58..c8fd582 100644 --- a/mbproxy/codereviews/2026-05-14/ReReviewAfterRemediation.md +++ b/mbproxy/codereviews/2026-05-14/ReReviewAfterRemediation.md @@ -6,9 +6,9 @@ Re-review of the codebase after the six-commit remediation of the original 2026- ## Status -> **All actionable findings resolved.** Wave 4 (`7a43595`) closed NC1 + NM1 + NM2 + NM5 + Nm1 + T2. Wave 4-followup (this commit) closed NM3 + NM4 + Nm6 + Nm7 + T3 + T4. Remaining items are **Accepted** with rationale (explained below) — they're deliberate trade-offs or doc-only nuances that don't warrant code changes. +> **All actionable findings resolved across two re-review passes.** Wave 4 (`7a43595`) closed NC1 + NM1 + NM2 + NM5 + Nm1 + T2. Wave 4-followup (`9251c56`) closed NM3 + NM4 + Nm6 + Nm7 + T3 + T4. A third focused pass surfaced one more major (W5/M1) and two cosmetics (W5/m1, W5/m2); Wave 5 (this commit) resolved M1 + m2 and documented m1 as accepted best-effort. > -> **Final test count:** 387 pass / 0 fail. Race tests stable across 3 isolated runs. +> **Final test count:** 387 pass / 0 fail. ## Headline @@ -43,6 +43,25 @@ The remediation was structurally sound. The re-review found: **Resolved: 13/18. Accepted: 5/18.** +## Third pass — final findings (Wave 5) + +A third focused review pass on the post-W4-followup state turned up these additional items: + +| ID | Severity | Finding | Status | Commit | +|----|----------|---------|--------|--------| +| **W5/M1** | Major | `AdminEndpointHost` `OnChange` callback can resurrect a Kestrel app after `StopAsync` returned (no `_disposed` check inside the fire-and-forget Task.Run lambda) | ✅ **Resolved** | (W5) | +| **W5/m1** | Minor | `TearDownBackendAsync` gate-not-held path: a concurrent freshly-allocated TxId can collide with one being released by the channel drain → silent request drop. Probability very low (gate timeout AND new accept AND TxId collision in 65,536-slot space). | ⚪ **Accepted** | (W5 — inline doc comment in `PlcMultiplexer.cs`) | +| **W5/m2** | Minor | `inFlightAtCancel` was computed AFTER `base.StopAsync` — narrower window than the field name promises | ✅ **Resolved** | (W5) | +| **W5/m3** | Cosmetic | `CountInFlight` allocates a 35-field `CounterSnapshot` record per supervisor on shutdown | ⚪ **Accepted** (skip) | — | + +**W5/M1 fix detail.** Added `if (_disposed) return;` at the top of the `OnChange` lambda AND inside the queued `Task.Run`, plus `try/catch (ObjectDisposedException)` around `_lock.WaitAsync` and `_lock.Release()` so a hot-reload of `AdminPort` during shutdown can no longer resurrect a fresh Kestrel WebApplication on the new port after the host considered admin shut down. + +**W5/m2 fix detail.** Moved `int inFlightAtCancel = CountInFlight();` to BEFORE `await base.StopAsync(cancellationToken)`. Now the count actually reflects "in-flight at the moment the host signalled stop" — not "in-flight at the moment we got around to computing it after the cancel propagated." + +**W5/m1 acceptance.** Documented inline at `PlcMultiplexer.cs:TearDownBackendAsync` near the `gateHeld` flag declaration. The race requires three coincidences (gate-timeout + new accept landing during cascade + TxId collision); the only consequence is one dropped request that the client retries on its next attempt. + +**W5/m3 skip.** Trivial per-PLC allocation (~5 KB on a 54-PLC fleet, called once per shutdown). Optimising it would require exposing a single-field accessor on `ProxyCounters`; not worth the surface change. + --- ## Resolved findings — what landed @@ -126,4 +145,4 @@ The original re-review listed the following as verified clean by inspection duri ## Closed -The 2026-05-14 review series — original review → 4 remediation waves → re-review → wave-4 → wave-4-followup — is now closed. Tests: 387 pass / 0 fail. Three back-to-back race-test runs in isolation all green. Every actionable finding resolved or explicitly accepted with rationale. +The 2026-05-14 review series — original review → 4 remediation waves → first re-review → wave 4 + followup → second re-review → wave 5 — is now closed. Tests: 387 pass / 0 fail. Three back-to-back race-test runs in isolation all green. Every actionable finding resolved or explicitly accepted with rationale. diff --git a/mbproxy/src/Mbproxy/Admin/AdminEndpointHost.cs b/mbproxy/src/Mbproxy/Admin/AdminEndpointHost.cs index 068e79c..3e2f03e 100644 --- a/mbproxy/src/Mbproxy/Admin/AdminEndpointHost.cs +++ b/mbproxy/src/Mbproxy/Admin/AdminEndpointHost.cs @@ -77,15 +77,35 @@ internal sealed partial class AdminEndpointHost : IAsyncDisposable // Subscribe to config changes: if AdminPort changes, re-bind. _optionsChangeRegistration = _optionsMonitor.OnChange(opts => { + // Phase 12 (W5 / M1) — short-circuit if disposal has already started. The + // OnChange callback can fire (and the Task.Run can be queued) AFTER StopAsync + // disposed the change registration but BEFORE DI ran DisposeAsync; without + // this guard the lambda would resurrect a fresh Kestrel app on the new port + // after the host already considered admin shut down. + if (_disposed) return; + int newPort = opts.AdminPort; if (newPort == _currentPort) return; // Only care about AdminPort changes. // Fire-and-forget: re-bind is async; we can't await in OnChange. _ = Task.Run(async () => { - await _lock.WaitAsync().ConfigureAwait(false); + // Re-check after the queue: a concurrent StopAsync/DisposeAsync may have + // landed between OnChange firing and the threadpool picking us up. + if (_disposed) return; + try { + await _lock.WaitAsync().ConfigureAwait(false); + } + catch (ObjectDisposedException) + { + // _lock disposed mid-queue — host is shutting down. Drop silently. + return; + } + try + { + if (_disposed) return; if (newPort == _currentPort) return; // double-check under lock // Stop the old app. @@ -98,7 +118,8 @@ internal sealed partial class AdminEndpointHost : IAsyncDisposable } finally { - _lock.Release(); + try { _lock.Release(); } + catch (ObjectDisposedException) { /* dispose race */ } } }); }); diff --git a/mbproxy/src/Mbproxy/Proxy/Multiplexing/PlcMultiplexer.cs b/mbproxy/src/Mbproxy/Proxy/Multiplexing/PlcMultiplexer.cs index 687094c..29b8551 100644 --- a/mbproxy/src/Mbproxy/Proxy/Multiplexing/PlcMultiplexer.cs +++ b/mbproxy/src/Mbproxy/Proxy/Multiplexing/PlcMultiplexer.cs @@ -348,6 +348,18 @@ internal sealed class PlcMultiplexer : IAsyncDisposable, IMultiplexCountersProvi // bounds disposal latency; if the gate is unavailable we proceed best-effort // without it (the worst-case consequence is one orphaned in-flight cycle on the // dying backend, which the upstream watchdog will surface as exception 0x0B). + // + // Phase 12 (W5 / m1) — KNOWN RACE on the gate-not-held path: a concurrent + // EnsureBackendConnectedAsync that DOES hold the gate may TryAllocate a TxId + // that collides (after wraparound in the allocator's forward scan) with a TxId + // we're about to release from the channel-drain step below. The double-release + // would mark the new request's slot as free even though it's legitimately + // in-flight, allowing the next allocation to reuse the same slot and + // CorrelationMap.TryAdd to fail (silent request drop). Probability is very low + // (requires gate timeout + new accept landing during cascade + TxId collision in + // a 65,536-slot space); the only consequence is one dropped request that the + // client retries. Documented as accepted best-effort behaviour in + // codereviews/2026-05-14/ReReviewAfterRemediation.md (m1). bool gateHeld = false; try { diff --git a/mbproxy/src/Mbproxy/Proxy/ProxyWorker.cs b/mbproxy/src/Mbproxy/Proxy/ProxyWorker.cs index 6bf62be..6564155 100644 --- a/mbproxy/src/Mbproxy/Proxy/ProxyWorker.cs +++ b/mbproxy/src/Mbproxy/Proxy/ProxyWorker.cs @@ -277,18 +277,23 @@ internal sealed partial class ProxyWorker : BackgroundService /// public override async Task StopAsync(CancellationToken cancellationToken) { + // Phase 12 (W5 / m2) — snapshot in-flight BEFORE base.StopAsync so the field + // matches its name: "the count at the moment the host signalled stop", not "the + // count at the moment we got around to computing it." `base.StopAsync` cancels the + // ExecuteAsync stoppingToken; in the milliseconds before it returns, in-flight + // requests whose responses arrive will be removed from _correlation and the + // watchdog can clear stale entries — the count would otherwise drift downward. + // + // Phase 12 (W4 / NC1) — must run BEFORE supervisor stop too: after + // supervisor.StopAsync, multiplexers are disposed and CountInFlight returns 0 + // unconditionally (the original ShutdownCoordinator had the same defect). + int inFlightAtCancel = CountInFlight(); + // Cancel ExecuteAsync first. await base.StopAsync(cancellationToken).ConfigureAwait(false); var sw = Stopwatch.StartNew(); - // Phase 12 (W4 / NC1) — snapshot in-flight count BEFORE supervisor stop. After - // supervisor.StopAsync, multiplexers are disposed and CountInFlight returns 0 - // unconditionally; reading after the stop produced a meaningless always-zero log - // (the original ShutdownCoordinator had the same defect — see - // codereviews/2026-05-14/ReReviewAfterRemediation.md NC1). - int inFlightAtCancel = CountInFlight(); - // Phase 12 (W2.20) — supervisor stop deadline read from the live config so a // hot-reloaded GracefulShutdownTimeoutMs is honoured. Supervisor stop is the // drain: cancelling the supervisor cancels the listener, which exits accept, which