fix: resolve code-review findings (locally verified)

Server-054/055/056, Contracts-020/021/022, Tests-036/038/039,
IntegrationTests-030/031/032 (+033 deferred to live rig),
Client.Dotnet-026/028/029 (+027 won't-fix), Client.Go-030..034,
Client.Python-032..036, Client.Rust-033..038.

Key fix: SessionEventDistributor orphaned a subscriber that registered after
the pump completed but before disposal (Server-056) -> register paths now
complete late registrants under _lifecycleLock; regression test added. The
racy dashboard-mirror gRPC test was made deterministic (Tests-039).

Verified green locally: gateway Tests targeted classes (GatewaySession,
SessionEventDistributor, GatewayOptionsValidator, ProtobufContractRoundTrip,
GatewaySessionDashboardMirror) + dotnet/go/python/rust client suites.
This commit is contained in:
Joseph Doherty
2026-06-17 05:23:14 -04:00
parent 25d04ec37e
commit 6b5fe6aa82
37 changed files with 1049 additions and 211 deletions
@@ -25,6 +25,12 @@ public sealed class GatewaySession
private readonly TimeSpan _detachGrace;
private readonly TimeSpan _workerReadyWaitTimeout;
private DateTimeOffset? _detachedAtUtc;
// True once at least one external subscriber attached SUCCESSFULLY. Detach-grace's
// "last subscriber dropped" stamp (see DetachEventSubscriber) is gated on this so a
// FAILED first attach — which still runs the rollback DetachEventSubscriber from the
// attach catch path — does not push a never-subscribed session into the grace window
// (Server-055).
private bool _everHadEventSubscriber;
private SessionEventDistributor? _eventDistributor;
private bool _eventDistributorStarted;
private bool _dashboardMirrorStarted;
@@ -842,6 +848,7 @@ public sealed class GatewaySession
try
{
IEventSubscriberLease distributorLease = StartDistributorAndRegister();
MarkEventSubscriberAttached();
return new EventSubscriberLease(this, distributorLease);
}
catch
@@ -906,6 +913,7 @@ public sealed class GatewaySession
out ulong oldestAvailableSequence,
out ulong liveResumeSequence);
MarkEventSubscriberAttached();
return new EventSubscriberReplayAttachment(
new EventSubscriberLease(this, distributorLease),
replayedEvents,
@@ -920,6 +928,17 @@ public sealed class GatewaySession
}
}
// Records that an external subscriber attached successfully by setting
// _everHadEventSubscriber. Both attach paths call this only after the distributor
// registration has returned and before the lease is handed back to the caller.
//
// The flag gates the detach-grace "last subscriber dropped" stamp so a FAILED first
// attach (which still rolls back via DetachEventSubscriber) never pushes a
// never-subscribed session into grace (Server-055).
private void MarkEventSubscriberAttached()
{
// Written under _syncRoot because the detach-grace check that reads this flag is
// described as serializing on the same lock.
// NOTE(review): the reader's lock scope is outside this view — confirm.
lock (_syncRoot)
{
_everHadEventSubscriber = true;
}
}
/// <summary>
/// Invokes a worker command synchronously and returns the reply.
/// </summary>
@@ -1862,7 +1881,12 @@ public sealed class GatewaySession
// Closing/Closed/Faulted there is nothing to retain. This is the detach→grace-start
// transition; it shares _syncRoot with the reattach→grace-cancel write above and the
// sweeper's IsDetachGraceExpired read, so the three serialize.
if (_detachGrace > TimeSpan.Zero
// Only stamp a detach that mirrors a prior SUCCESSFUL attach. The attach catch path
// calls this same method to roll back a reserved slot when the FIRST attach failed
// before any subscriber registered; that never-subscribed session must not enter the
// grace window (Server-055).
if (_everHadEventSubscriber
&& _detachGrace > TimeSpan.Zero
&& _activeEventSubscriberCount == 0
&& _state is not (SessionState.Closing or SessionState.Closed or SessionState.Faulted))
{
@@ -116,6 +116,17 @@ public sealed class SessionEventDistributor : IAsyncDisposable
private bool _started;
private bool _disposed;
// Set once the pump has run its final CompleteAllSubscribers sweep — the event source
// completed or faulted and the pump exited. Guarded by _lifecycleLock together with the
// subscriber add. A subscriber that registers AFTER this point but BEFORE DisposeAsync
// (the source ended but the session is not yet torn down) would otherwise be added with a
// channel the now-exited pump never completes, hanging its reader forever. The register
// paths complete such a late registrant's channel immediately with the same terminal
// state. _completionError carries the terminal exception (source fault) or null (graceful
// source completion), mirroring what the final CompleteAllSubscribers passed.
private bool _completed;
private Exception? _completionError;
/// <summary>
/// Initializes a per-session event distributor.
/// </summary>
@@ -304,6 +315,16 @@ public sealed class SessionEventDistributor : IAsyncDisposable
{
ObjectDisposedException.ThrowIf(_disposed, this);
_subscribers[subscriber.Id] = subscriber;
// Close the register-after-pump-completion window: if the pump already ran its
// final CompleteAllSubscribers (source completed/faulted) but the distributor is
// not yet disposed, no further completion sweep will run, so complete this late
// registrant's channel now with the same terminal state instead of leaving its
// reader hanging.
if (_completed)
{
subscriber.Channel.Writer.TryComplete(_completionError);
}
}
return new SubscriberLease(this, subscriber);
@@ -450,6 +471,14 @@ public sealed class SessionEventDistributor : IAsyncDisposable
{
ObjectDisposedException.ThrowIf(_disposed, this);
_subscribers[id] = subscriber;
// Same register-after-pump-completion guard as Register: a resume that races in
// after the source already ended still gets its retained replay batch (snapshot
// above), but its live channel must be completed now since the pump is gone.
if (_completed)
{
subscriber.Channel.Writer.TryComplete(_completionError);
}
}
}
@@ -628,9 +657,21 @@ public sealed class SessionEventDistributor : IAsyncDisposable
/// <summary>
/// Records the terminal state of the event source and completes the channel of every
/// subscriber currently in the map with that state.
/// </summary>
/// <param name="error">
/// Terminal exception handed to each channel writer (source fault), or
/// <see langword="null"/> for a graceful source completion. The same value is stored in
/// <c>_completionError</c> so the register paths can complete late registrants with it.
/// </param>
private void CompleteAllSubscribers(Exception? error)
{
    // Record the terminal state AND complete the current subscribers under a single
    // _lifecycleLock acquisition so this serializes with the subscriber-add in
    // Register/RegisterWithReplay: a subscriber added before this runs is in the map and
    // is completed by the loop; one that races in afterward sees _completed and completes
    // its own channel in the register path. Exactly one of the two completes each
    // subscriber.
    //
    // The terminal state is recorded before (and independently of) the sweep, so it is
    // set even when the map is empty; otherwise a first subscriber registering after the
    // pump exited would never observe _completed and its reader would hang.
    //
    // TryComplete is non-blocking. NOTE(review): holding the lock across the loop relies
    // on the subscriber channels being created with AllowSynchronousContinuations=false
    // (so no reader continuation runs inline); the creation site is outside this view.
    lock (_lifecycleLock)
    {
        _completed = true;
        _completionError = error;

        foreach (Subscriber subscriber in _subscribers.Values)
        {
            subscriber.Channel.Writer.TryComplete(error);
        }
    }
}