feat(sessions): per-subscriber backpressure isolation in SessionEventDistributor

This commit is contained in:
Joseph Doherty
2026-06-15 13:39:25 -04:00
parent 61627fc5b0
commit 039111ca05
9 changed files with 308 additions and 66 deletions
@@ -4,6 +4,22 @@ using ZB.MOM.WW.MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.MxGateway.Server.Sessions;
/// <summary>
/// Callback invoked by the distributor's pump (on the pump thread) when a non-blocking
/// write to a subscriber's bounded channel fails and the event cannot be delivered.
/// The handler applies the per-subscriber backpressure policy: it records the overflow
/// metric and, in the legacy single-subscriber FailFast case, faults the owning session.
/// It does NOT complete the subscriber's channel — the distributor always disconnects the
/// offending subscriber with an overflow fault — so the handler is purely observability
/// plus the session-fault decision.
/// </summary>
/// <remarks>
/// The distributor wraps the invocation in a try/catch: an exception thrown by the handler
/// is logged at error level and swallowed, and the offending subscriber is disconnected
/// anyway. Implementations should still be fast and non-blocking, because the call runs
/// inline on the single pump that feeds every subscriber of the session.
/// </remarks>
/// <param name="isOnlySubscriber">
/// <see langword="true"/> when the overflowing subscriber is the sole registered
/// subscriber at the moment of overflow (legacy single-subscriber mode). FailFast faults
/// the session only in this case; with multiple subscribers FailFast degrades to a
/// per-subscriber disconnect so one slow consumer never faults a session shared by others.
/// The value is a point-in-time snapshot of the distributor's subscriber set, so a
/// registration racing with the overflow may not be reflected in it.
/// </param>
public delegate void SubscriberOverflowHandler(bool isOnlySubscriber);
/// <summary>
/// Per-session event pump and fan-out. A single background task drains the
/// session's event source <em>exactly once</em> and fans each event out to
@@ -12,10 +28,13 @@ namespace ZB.MOM.WW.MxGateway.Server.Sessions;
/// <remarks>
/// <para>
/// Introduced by Task 2 of the Session Resilience epic; the bounded replay ring
/// buffer was added by Task 3. The class is NOT yet wired into
/// <c>GatewaySession</c> or <c>EventStreamService</c> (Task 4), has no
/// per-subscriber backpressure-isolation policy (Task 5), and does not remove
/// the single-subscriber guard (Tasks 7/8). The ring buffer supports capacity
/// buffer was added by Task 3, it was wired into <c>GatewaySession</c> and
/// <c>EventStreamService</c> by Task 4, and the per-subscriber backpressure-isolation
/// policy (Task 5) is implemented here: a slow subscriber overflows only its own
/// bounded channel and the pump applies the policy to that subscriber alone (see
/// <see cref="SubscriberOverflowHandler"/> and <c>OnSubscriberOverflow</c>), leaving
/// the pump, the session, and other subscribers running. The class does not yet
/// remove the single-subscriber guard (Tasks 7/8). The ring buffer supports capacity
/// eviction (oldest entry dropped when the count exceeds
/// <c>replayBufferCapacity</c>) and age eviction (entries older than
/// <c>replayRetentionSeconds</c> dropped on the next append or query), and is
@@ -57,6 +76,7 @@ public sealed class SessionEventDistributor : IAsyncDisposable
private readonly string _sessionId;
private readonly Func<CancellationToken, IAsyncEnumerable<MxEvent>> _eventSourceFactory;
private readonly int _subscriberQueueCapacity;
private readonly SubscriberOverflowHandler? _overflowHandler;
private readonly TimeSpan _shutdownTimeout;
private readonly ILogger<SessionEventDistributor> _logger;
private readonly TimeProvider _timeProvider;
@@ -106,7 +126,8 @@ public sealed class SessionEventDistributor : IAsyncDisposable
string sessionId,
Func<CancellationToken, IAsyncEnumerable<MxEvent>> eventSourceFactory,
int subscriberQueueCapacity,
ILogger<SessionEventDistributor> logger)
ILogger<SessionEventDistributor> logger,
SubscriberOverflowHandler? overflowHandler = null)
: this(
sessionId,
eventSourceFactory,
@@ -114,7 +135,8 @@ public sealed class SessionEventDistributor : IAsyncDisposable
replayBufferCapacity: 0,
replayRetentionSeconds: 0,
logger,
TimeProvider.System)
TimeProvider.System,
overflowHandler)
{
}
@@ -144,6 +166,14 @@ public sealed class SessionEventDistributor : IAsyncDisposable
/// Clock used to timestamp and age-evict replay entries. Inject a fake to make
/// age-eviction deterministic in tests.
/// </param>
/// <param name="overflowHandler">
/// Optional per-subscriber backpressure handler invoked when a subscriber's bounded
/// channel is full. It records the overflow metric and, for the legacy
/// single-subscriber FailFast case, faults the owning session. The distributor always
/// disconnects the offending subscriber with an overflow fault regardless of the
/// handler. When <see langword="null"/> (unit/skeleton use) the offending subscriber is
/// still disconnected but no metric/fault side effect runs.
/// </param>
public SessionEventDistributor(
string sessionId,
Func<CancellationToken, IAsyncEnumerable<MxEvent>> eventSourceFactory,
@@ -151,7 +181,8 @@ public sealed class SessionEventDistributor : IAsyncDisposable
int replayBufferCapacity,
double replayRetentionSeconds,
ILogger<SessionEventDistributor> logger,
TimeProvider timeProvider)
TimeProvider timeProvider,
SubscriberOverflowHandler? overflowHandler = null)
{
ArgumentException.ThrowIfNullOrWhiteSpace(sessionId);
ArgumentNullException.ThrowIfNull(eventSourceFactory);
@@ -164,6 +195,7 @@ public sealed class SessionEventDistributor : IAsyncDisposable
_sessionId = sessionId;
_eventSourceFactory = eventSourceFactory;
_subscriberQueueCapacity = subscriberQueueCapacity;
_overflowHandler = overflowHandler;
_shutdownTimeout = DefaultShutdownTimeout;
_replayBufferCapacity = replayBufferCapacity;
_ageEvictionEnabled = replayRetentionSeconds > 0;
@@ -214,11 +246,15 @@ public sealed class SessionEventDistributor : IAsyncDisposable
// (one gRPC stream / dashboard subscriber). Synchronous continuations are
// disabled so a slow reader can never stall the pump on its completion.
//
// FullMode is Wait but the pump currently writes with TryWrite (drop-on-full):
// these are deliberately opposite policies and only a placeholder. Task 5 owns
// the overflow policy and will reconcile them by either switching the pump to
// WriteAsync (true backpressure, honouring Wait) or changing this to a Drop mode.
// Do not "fix" the mismatch here — leave the decision to Task 5.
// The pump MUST stay non-blocking: it writes with the non-blocking TryWrite so one
// slow reader can never stall the single pump that feeds every subscriber. FullMode
// is deliberately Wait — NOT because the pump ever blocks (it never calls the blocking
// WriteAsync overload), but because Wait is the only BoundedChannelFullMode under
// which TryWrite returns false when the channel is full. That false return IS the
// overflow signal the pump needs to apply the per-subscriber backpressure policy. The
// Drop* modes would make TryWrite silently succeed-and-drop, hiding overflow and
// re-introducing the silent data loss this task removes. So: Wait mode + TryWrite =
// a non-blocking pump that still detects a full subscriber channel.
// Caveat: TryWrite also returns false once the channel has been completed, so a false
// return for a subscriber that is no longer registered is a disconnect, not an overflow.
Channel<MxEvent> channel = Channel.CreateBounded<MxEvent>(
new BoundedChannelOptions(_subscriberQueueCapacity)
{
@@ -321,20 +357,15 @@ public sealed class SessionEventDistributor : IAsyncDisposable
// which matches "late subscribers see events after they register".
foreach (Subscriber subscriber in _subscribers.Values)
{
// TODO(Task 5): define overflow policy (per-subscriber isolation —
// drop / disconnect / fault that one subscriber). For the Task 2
// skeleton, a non-blocking TryWrite that silently drops on a full
// channel is the placeholder so one slow reader never stalls the pump.
// Non-blocking write: TryWrite never blocks the pump on a slow reader.
// A false return means this subscriber's bounded channel is full — the
// per-subscriber overflow signal. We apply the backpressure policy to
// THIS subscriber only; the pump, the session, and every other subscriber
// keep running. Logs identifiers (worker sequence, subscriber id, session)
// only, never the event payload or tag values.
if (!subscriber.Channel.Writer.TryWrite(mxEvent))
{
// Visibility only — Task 5 owns the actual drop/backpressure policy.
// Logs identifiers (worker sequence, subscriber id, session) only,
// never the event payload or tag values.
_logger.LogDebug(
"Event distributor dropped event (worker sequence {WorkerSequence}) for subscriber {SubscriberId} in session {SessionId}: channel full.",
mxEvent.WorkerSequence,
subscriber.Id,
_sessionId);
OnSubscriberOverflow(subscriber, mxEvent.WorkerSequence);
}
}
}
@@ -357,6 +388,52 @@ public sealed class SessionEventDistributor : IAsyncDisposable
}
}
// Applies the per-subscriber backpressure policy after the pump's non-blocking TryWrite to
// a subscriber's bounded channel returned false. Runs on the pump thread.
//
// The offending subscriber is unregistered and its channel is completed with an
// EventQueueOverflow fault, so it can never wedge the pump again; the overflow handler
// decides the observable side effects (overflow metric, and — for legacy single-subscriber
// FailFast — faulting the owning session). Multi-subscriber FailFast intentionally degrades
// to a plain disconnect (see SubscriberOverflowHandler docs): one slow consumer must not
// fault a session shared by other healthy subscribers.
//
// Parameters:
//   subscriber     - the subscriber whose channel rejected the write.
//   workerSequence - worker sequence of the undeliverable event; logged as an identifier
//                    only (never the event payload or tag values).
private void OnSubscriberOverflow(Subscriber subscriber, ulong workerSequence)
{
    // Claim the disconnect atomically BEFORE running any side effect. The pump iterates a
    // snapshot of the subscriber set, and TryWrite returns false for a completed channel as
    // well as for a full one, so this method can be reached for a subscriber that was
    // unregistered concurrently. Such a subscriber did not overflow: counting it in the
    // overflow metric — or worse, faulting a session that other healthy subscribers still
    // use because the remaining count happens to be 1 — would be wrong. Whoever removed it
    // owns its channel completion, so there is nothing left to do here.
    if (!_subscribers.TryRemove(subscriber.Id, out _))
    {
        return;
    }

    // The offender has just been removed, so an empty set means it was the sole registered
    // subscriber (the legacy single-subscriber mode used by the FailFast back-compat path).
    bool isOnlySubscriber = _subscribers.IsEmpty;

    _logger.LogDebug(
        "Event distributor disconnecting subscriber {SubscriberId} in session {SessionId} after queue overflow (worker sequence {WorkerSequence}).",
        subscriber.Id,
        _sessionId,
        workerSequence);

    // Observability + session-fault decision. Errors here must not stall the pump or
    // leave the subscriber's reader hanging, so the channel completion below runs regardless.
    try
    {
        _overflowHandler?.Invoke(isOnlySubscriber);
    }
    catch (Exception exception)
    {
        _logger.LogError(
            exception,
            "Event distributor overflow handler threw for session {SessionId}; disconnecting subscriber {SubscriberId} anyway.",
            _sessionId,
            subscriber.Id);
    }

    // Disconnect ONLY this subscriber: complete its channel with the overflow fault. Its
    // gRPC reader's MoveNextAsync then throws the SessionManagerException, which
    // EventStreamService surfaces to the client exactly as the pre-epic per-RPC overflow
    // did. The pump and every other subscriber are untouched. TryComplete is a no-op if
    // the channel was already completed by someone else.
    subscriber.Channel.Writer.TryComplete(new SessionManagerException(
        SessionManagerErrorCode.EventQueueOverflow,
        $"Session {_sessionId} event stream queue overflowed."));
}
private void CompleteAllSubscribers(Exception? error)
{
foreach (Subscriber subscriber in _subscribers.Values)