feat(sessions): multi-subscriber cap enforcement + mode-gated FailFast

This commit is contained in:
Joseph Doherty
2026-06-15 15:32:08 -04:00
parent 7b12eebbd1
commit ac42783e36
12 changed files with 399 additions and 81 deletions
@@ -13,12 +13,16 @@ namespace ZB.MOM.WW.MxGateway.Server.Sessions;
/// regardless of what the handler does.
/// </summary>
/// <param name="isOnlySubscriber">
/// <see langword="true"/> when the overflowing subscriber is the sole registered
/// subscriber at the moment of overflow (legacy single-subscriber mode). FailFast faults
/// the session only in this case; with multiple subscribers FailFast degrades to a
/// per-subscriber disconnect so one slow consumer never faults a session shared by others.
/// Always <see langword="false"/> for internal subscribers (the dashboard mirror) because
/// <see cref="SessionEventDistributor"/> excludes them from the external-subscriber count.
/// <see langword="true"/> when FailFast is allowed to fault the whole session for this
/// overflow. As of Task 8 this is gated on the SESSION MODE, not a live count: it is
/// <see langword="true"/> only for an external subscriber in single-subscriber mode
/// (<c>AllowMultipleEventSubscribers == false</c>), where at most one external subscriber
/// can ever exist. In multi-subscriber mode it is always <see langword="false"/>, so
/// FailFast degrades to a per-subscriber disconnect and one slow consumer never faults a
/// session shared by others; gating on the fixed mode also removes the Task 5 race where a
/// concurrent registration could make a count snapshot falsely report a sole subscriber.
/// Always <see langword="false"/> for internal subscribers (the dashboard mirror) so a
/// slow/broken dashboard can never fault the session.
/// </param>
/// <param name="isInternal">
/// <see langword="true"/> when the overflowing subscriber is the gateway-owned internal
@@ -40,8 +44,10 @@ public delegate void SubscriberOverflowHandler(bool isOnlySubscriber, bool isInt
/// policy (Task 5) is implemented here: a slow subscriber overflows only its own
/// bounded channel and the pump applies the policy to that subscriber alone (see
/// <see cref="SubscriberOverflowHandler"/> and <c>OnSubscriberOverflow</c>), leaving
/// the pump, the session, and other subscribers running. The class does not yet
/// remove the single-subscriber guard (Tasks 7/8). The ring buffer supports capacity
/// the pump, the session, and other subscribers running. Task 8 made the
/// FailFast-faults-session decision mode-gated: it fires only in single-subscriber
/// mode (<c>singleSubscriberMode</c>), so multi-subscriber FailFast always degrades to
/// a per-subscriber disconnect — see <c>OnSubscriberOverflow</c>. The ring buffer supports capacity
/// eviction (oldest entry dropped when the count exceeds
/// <c>replayBufferCapacity</c>) and age eviction (entries older than
/// <c>replayRetentionSeconds</c> dropped on the next append or query), and is
@@ -83,6 +89,7 @@ public sealed class SessionEventDistributor : IAsyncDisposable
private readonly string _sessionId;
private readonly Func<CancellationToken, IAsyncEnumerable<MxEvent>> _eventSourceFactory;
private readonly int _subscriberQueueCapacity;
private readonly bool _singleSubscriberMode;
private readonly SubscriberOverflowHandler? _overflowHandler;
private readonly TimeSpan _shutdownTimeout;
private readonly ILogger<SessionEventDistributor> _logger;
@@ -134,7 +141,8 @@ public sealed class SessionEventDistributor : IAsyncDisposable
Func<CancellationToken, IAsyncEnumerable<MxEvent>> eventSourceFactory,
int subscriberQueueCapacity,
ILogger<SessionEventDistributor> logger,
SubscriberOverflowHandler? overflowHandler = null)
SubscriberOverflowHandler? overflowHandler = null,
bool singleSubscriberMode = true)
: this(
sessionId,
eventSourceFactory,
@@ -143,7 +151,8 @@ public sealed class SessionEventDistributor : IAsyncDisposable
replayRetentionSeconds: 0,
logger,
TimeProvider.System,
overflowHandler)
overflowHandler,
singleSubscriberMode)
{
}
@@ -181,6 +190,17 @@ public sealed class SessionEventDistributor : IAsyncDisposable
/// handler. When <see langword="null"/> (unit/skeleton use) the offending subscriber is
/// still disconnected but no metric/fault side effect runs.
/// </param>
/// <param name="singleSubscriberMode">
/// <see langword="true"/> when the owning session is in single-subscriber mode
/// (<c>AllowMultipleEventSubscribers == false</c>). This gates the FailFast
/// session-fault decision in <c>OnSubscriberOverflow</c>: an external subscriber that
/// overflows reports <c>isOnlySubscriber == true</c> (legacy FailFast faults the
/// session) ONLY in single-subscriber mode. In multi-subscriber mode it is always
/// <see langword="false"/>, so FailFast degrades to a per-subscriber disconnect and a
/// transient registration race can never falsely fault a shared session (Task 8;
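/// resolves the Task 5 REVISIT race). Defaults to <see langword="true"/> so existing
/// call sites and unit tests keep legacy single-subscriber FailFast behavior; a caller
/// whose session allows multiple subscribers must therefore pass <see langword="false"/>
/// explicitly, otherwise an overflowing external subscriber still reports
/// <c>isOnlySubscriber == true</c>.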
/// </param>
public SessionEventDistributor(
string sessionId,
Func<CancellationToken, IAsyncEnumerable<MxEvent>> eventSourceFactory,
@@ -189,7 +209,8 @@ public sealed class SessionEventDistributor : IAsyncDisposable
double replayRetentionSeconds,
ILogger<SessionEventDistributor> logger,
TimeProvider timeProvider,
SubscriberOverflowHandler? overflowHandler = null)
SubscriberOverflowHandler? overflowHandler = null,
bool singleSubscriberMode = true)
{
ArgumentException.ThrowIfNullOrWhiteSpace(sessionId);
ArgumentNullException.ThrowIfNull(eventSourceFactory);
@@ -202,6 +223,7 @@ public sealed class SessionEventDistributor : IAsyncDisposable
_sessionId = sessionId;
_eventSourceFactory = eventSourceFactory;
_subscriberQueueCapacity = subscriberQueueCapacity;
_singleSubscriberMode = singleSubscriberMode;
_overflowHandler = overflowHandler;
_shutdownTimeout = DefaultShutdownTimeout;
_replayBufferCapacity = replayBufferCapacity;
@@ -416,28 +438,25 @@ public sealed class SessionEventDistributor : IAsyncDisposable
// slow consumer must not fault a session shared by other healthy subscribers.
private void OnSubscriberOverflow(Subscriber subscriber, ulong workerSequence)
{
// Snapshot whether this is the sole subscriber BEFORE we unregister it. This drives
// the FailFast-fault-session-vs-disconnect decision: FailFast only faults the session
// when the overflowing subscriber is the sole subscriber.
// Decide whether FailFast may fault the whole session for this overflow. This is the
// "isOnlySubscriber" signal the legacy single-subscriber FailFast path keys on.
//
// This snapshot is safe in v1 because AllowMultipleEventSubscribers=false is enforced
// by the validator and the single-subscriber guard in AttachEventSubscriber — a
// concurrent second registration is impossible, so the false-FailFast race (two
// subscribers, one overflows, Count reads as 1 after the other concurrently unregisters,
// FailFast wrongly faults the session) cannot occur today.
// Task 8 resolution of the Task 5/7 REVISIT race: gate this on the SESSION MODE
// (_singleSubscriberMode), NOT on a live count snapshot. The old
// `CountExternalSubscribers() == 1` snapshot raced once multi-subscriber became real —
// a concurrent second registration/unregistration could make the count read as 1 with
// two subscribers actually present, producing a false FailFast that faults a shared
// session. The mode is fixed for the session's lifetime, so reading it is race-free:
// - single-subscriber mode: at most one external subscriber can ever exist (the
// AttachEventSubscriber guard enforces it), so an overflowing external subscriber
// IS the sole subscriber — preserve the legacy FailFast session-fault behavior.
// - multi-subscriber mode: never fault the shared session; FailFast degrades to a
// per-subscriber disconnect so one slow consumer cannot punish healthy ones.
//
// REVISIT (Task 7/8): when multi-subscriber is enabled the guard is removed and the
// race window opens — a concurrent second registration could cause Count to read as 1
// here even with two subscribers, producing a false FailFast that faults a shared
// session. Resolve before enabling multi-subscriber.
//
// Task 6: the gateway-owned internal dashboard subscriber is excluded from this
// accounting. (a) An internal subscriber that overflows is NEVER the "only subscriber"
// — a slow/broken dashboard must never fault the session, only disconnect its own
// mirror. (b) Internal subscribers are excluded from the count, so a lone external
// gRPC subscriber still reports isOnlySubscriber==true and preserves the legacy
// FailFast session-fault behavior even while the dashboard mirror is attached.
bool isOnlySubscriber = !subscriber.IsInternal && CountExternalSubscribers() == 1;
// Task 6: the gateway-owned internal dashboard subscriber is excluded — an internal
// subscriber that overflows is NEVER the "only subscriber", so a slow/broken dashboard
// can only disconnect its own mirror and never fault the session.
bool isOnlySubscriber = !subscriber.IsInternal && _singleSubscriberMode;
_logger.LogDebug(
"Event distributor disconnecting subscriber {SubscriberId} in session {SessionId} after queue overflow (worker sequence {WorkerSequence}).",
@@ -473,22 +492,6 @@ public sealed class SessionEventDistributor : IAsyncDisposable
}
}
// Tallies the registered subscribers that are external, i.e. whose IsInternal flag is
// not set. Subscribers flagged as internal are skipped so they never add to the total
// that feeds the isOnlySubscriber FailFast decision.
private int CountExternalSubscribers()
{
    int externalTotal = 0;

    foreach (Subscriber candidate in _subscribers.Values)
    {
        // Internal subscribers are not part of the external tally.
        if (candidate.IsInternal)
        {
            continue;
        }

        externalTotal++;
    }

    return externalTotal;
}
private void CompleteAllSubscribers(Exception? error)
{
foreach (Subscriber subscriber in _subscribers.Values)