feat(sessions): multi-subscriber cap enforcement + mode-gated FailFast
This commit is contained in:
@@ -69,7 +69,8 @@ public sealed class EventStreamService(
|
||||
// block below, which also disposes the reader. A `using` declaration would add a
|
||||
// second Dispose on the same path and double-decrement the session subscriber count.
|
||||
IEventSubscriberLease subscriber = session.AttachEventSubscriber(
|
||||
options.Value.Sessions.AllowMultipleEventSubscribers);
|
||||
options.Value.Sessions.AllowMultipleEventSubscribers,
|
||||
options.Value.Sessions.MaxEventSubscribersPerSession);
|
||||
|
||||
int streamQueueDepth = 0;
|
||||
ulong afterWorkerSequence = request.AfterWorkerSequence;
|
||||
|
||||
@@ -949,6 +949,7 @@ public sealed class MxAccessGatewayService(
|
||||
SessionManagerErrorCode.SessionNotFound => StatusCode.NotFound,
|
||||
SessionManagerErrorCode.SessionNotReady => StatusCode.FailedPrecondition,
|
||||
SessionManagerErrorCode.EventSubscriberAlreadyActive => StatusCode.ResourceExhausted,
|
||||
SessionManagerErrorCode.EventSubscriberLimitReached => StatusCode.ResourceExhausted,
|
||||
SessionManagerErrorCode.EventQueueOverflow => StatusCode.ResourceExhausted,
|
||||
SessionManagerErrorCode.SessionLimitExceeded => StatusCode.ResourceExhausted,
|
||||
SessionManagerErrorCode.OpenFailed => StatusCode.Unavailable,
|
||||
|
||||
@@ -419,7 +419,8 @@ public sealed class GatewaySession
|
||||
eventOptions.ReplayRetentionSeconds,
|
||||
_eventStreaming.DistributorLogger,
|
||||
_eventStreaming.TimeProvider,
|
||||
CreateOverflowHandler(eventOptions.BackpressurePolicy));
|
||||
CreateOverflowHandler(eventOptions.BackpressurePolicy),
|
||||
singleSubscriberMode: !_eventStreaming.AllowMultipleEventSubscribers);
|
||||
}
|
||||
|
||||
startNow = false;
|
||||
@@ -680,14 +681,36 @@ public sealed class GatewaySession
|
||||
/// <summary>
|
||||
/// Attaches an event subscriber and returns a lease whose
|
||||
/// <see cref="IEventSubscriberLease.Reader"/> reads the fanned public
|
||||
/// <see cref="MxEvent"/>s for this subscriber. The single-subscriber guard
|
||||
/// (Tasks 7/8 relax it) is unchanged: with multi-subscriber disabled a second
|
||||
/// attach is rejected. The returned lease, when disposed, unregisters the
|
||||
/// distributor subscriber AND decrements the active-subscriber count.
|
||||
/// <see cref="MxEvent"/>s for this subscriber. The returned lease, when disposed,
|
||||
/// unregisters the distributor subscriber AND decrements the active-subscriber count.
|
||||
/// </summary>
|
||||
/// <param name="allowMultipleSubscribers">If true, allows multiple concurrent event subscribers.</param>
|
||||
public IEventSubscriberLease AttachEventSubscriber(bool allowMultipleSubscribers)
|
||||
/// <param name="allowMultipleSubscribers">
|
||||
/// When <see langword="false"/>, single-subscriber mode: a second concurrent EXTERNAL
|
||||
/// subscriber is rejected with <see cref="SessionManagerErrorCode.EventSubscriberAlreadyActive"/>.
|
||||
/// When <see langword="true"/>, multi-subscriber mode: up to
|
||||
/// <paramref name="maxSubscribers"/> concurrent EXTERNAL subscribers are allowed; the
|
||||
/// next attach is rejected with
|
||||
/// <see cref="SessionManagerErrorCode.EventSubscriberLimitReached"/>.
|
||||
/// </param>
|
||||
/// <param name="maxSubscribers">
|
||||
/// Maximum concurrent external subscribers in multi-subscriber mode
|
||||
/// (<c>MxGateway:Sessions:MaxEventSubscribersPerSession</c>). Ignored when
|
||||
/// <paramref name="allowMultipleSubscribers"/> is <see langword="false"/> (the effective
|
||||
/// cap is then 1). The gateway-owned internal dashboard subscriber is registered
|
||||
/// directly on the distributor and is NOT counted here, so it never consumes cap budget.
|
||||
/// </param>
|
||||
/// <remarks>
|
||||
/// The count-check-and-increment runs atomically under <c>_syncRoot</c>, so two
|
||||
/// concurrent attaches racing toward the cap can never both succeed past it. On
|
||||
/// distributor-register failure the count is rolled back (see the catch below).
|
||||
/// </remarks>
|
||||
public IEventSubscriberLease AttachEventSubscriber(bool allowMultipleSubscribers, int maxSubscribers)
|
||||
{
|
||||
// Effective cap: 1 in single-subscriber mode, otherwise the configured maximum
|
||||
// (clamped to at least 1 so a misconfigured non-positive value can never cause
|
||||
// every attach to be rejected in multi-subscriber mode).
|
||||
int effectiveCap = allowMultipleSubscribers ? Math.Max(1, maxSubscribers) : 1;
|
||||
|
||||
lock (_syncRoot)
|
||||
{
|
||||
if (_state != SessionState.Ready || _workerClient?.State != WorkerClientState.Ready)
|
||||
@@ -697,11 +720,15 @@ public sealed class GatewaySession
|
||||
$"Session {SessionId} is not ready for event streaming. Current state is {_state}.");
|
||||
}
|
||||
|
||||
if (!allowMultipleSubscribers && _activeEventSubscriberCount > 0)
|
||||
if (_activeEventSubscriberCount >= effectiveCap)
|
||||
{
|
||||
throw new SessionManagerException(
|
||||
SessionManagerErrorCode.EventSubscriberAlreadyActive,
|
||||
$"Session {SessionId} already has an active event stream subscriber.");
|
||||
throw allowMultipleSubscribers
|
||||
? new SessionManagerException(
|
||||
SessionManagerErrorCode.EventSubscriberLimitReached,
|
||||
$"Session {SessionId} has reached its maximum of {effectiveCap} concurrent event stream subscribers.")
|
||||
: new SessionManagerException(
|
||||
SessionManagerErrorCode.EventSubscriberAlreadyActive,
|
||||
$"Session {SessionId} already has an active event stream subscriber.");
|
||||
}
|
||||
|
||||
_activeEventSubscriberCount++;
|
||||
|
||||
@@ -13,12 +13,16 @@ namespace ZB.MOM.WW.MxGateway.Server.Sessions;
|
||||
/// regardless of what the handler does.
|
||||
/// </summary>
|
||||
/// <param name="isOnlySubscriber">
|
||||
/// <see langword="true"/> when the overflowing subscriber is the sole registered
|
||||
/// subscriber at the moment of overflow (legacy single-subscriber mode). FailFast faults
|
||||
/// the session only in this case; with multiple subscribers FailFast degrades to a
|
||||
/// per-subscriber disconnect so one slow consumer never faults a session shared by others.
|
||||
/// Always <see langword="false"/> for internal subscribers (the dashboard mirror) because
|
||||
/// <see cref="SessionEventDistributor"/> excludes them from the external-subscriber count.
|
||||
/// <see langword="true"/> when FailFast is allowed to fault the whole session for this
|
||||
/// overflow. As of Task 8 this is gated on the SESSION MODE, not a live count: it is
|
||||
/// <see langword="true"/> only for an external subscriber in single-subscriber mode
|
||||
/// (<c>AllowMultipleEventSubscribers == false</c>), where at most one external subscriber
|
||||
/// can ever exist. In multi-subscriber mode it is always <see langword="false"/>, so
|
||||
/// FailFast degrades to a per-subscriber disconnect and one slow consumer never faults a
|
||||
/// session shared by others; gating on the fixed mode also removes the Task 5 race where a
|
||||
/// concurrent registration could make a count snapshot falsely report a sole subscriber.
|
||||
/// Always <see langword="false"/> for internal subscribers (the dashboard mirror) so a
|
||||
/// slow/broken dashboard can never fault the session.
|
||||
/// </param>
|
||||
/// <param name="isInternal">
|
||||
/// <see langword="true"/> when the overflowing subscriber is the gateway-owned internal
|
||||
@@ -40,8 +44,10 @@ public delegate void SubscriberOverflowHandler(bool isOnlySubscriber, bool isInt
|
||||
/// policy (Task 5) is implemented here: a slow subscriber overflows only its own
|
||||
/// bounded channel and the pump applies the policy to that subscriber alone (see
|
||||
/// <see cref="SubscriberOverflowHandler"/> and <c>OnSubscriberOverflow</c>), leaving
|
||||
/// the pump, the session, and other subscribers running. The class does not yet
|
||||
/// remove the single-subscriber guard (Tasks 7/8). The ring buffer supports capacity
|
||||
/// the pump, the session, and other subscribers running. Task 8 made the
|
||||
/// FailFast-faults-session decision mode-gated: it fires only in single-subscriber
|
||||
/// mode (<c>singleSubscriberMode</c>), so multi-subscriber FailFast always degrades to
|
||||
/// a per-subscriber disconnect — see <c>OnSubscriberOverflow</c>. The ring buffer supports capacity
|
||||
/// eviction (oldest entry dropped when the count exceeds
|
||||
/// <c>replayBufferCapacity</c>) and age eviction (entries older than
|
||||
/// <c>replayRetentionSeconds</c> dropped on the next append or query), and is
|
||||
@@ -83,6 +89,7 @@ public sealed class SessionEventDistributor : IAsyncDisposable
|
||||
private readonly string _sessionId;
|
||||
private readonly Func<CancellationToken, IAsyncEnumerable<MxEvent>> _eventSourceFactory;
|
||||
private readonly int _subscriberQueueCapacity;
|
||||
private readonly bool _singleSubscriberMode;
|
||||
private readonly SubscriberOverflowHandler? _overflowHandler;
|
||||
private readonly TimeSpan _shutdownTimeout;
|
||||
private readonly ILogger<SessionEventDistributor> _logger;
|
||||
@@ -134,7 +141,8 @@ public sealed class SessionEventDistributor : IAsyncDisposable
|
||||
Func<CancellationToken, IAsyncEnumerable<MxEvent>> eventSourceFactory,
|
||||
int subscriberQueueCapacity,
|
||||
ILogger<SessionEventDistributor> logger,
|
||||
SubscriberOverflowHandler? overflowHandler = null)
|
||||
SubscriberOverflowHandler? overflowHandler = null,
|
||||
bool singleSubscriberMode = true)
|
||||
: this(
|
||||
sessionId,
|
||||
eventSourceFactory,
|
||||
@@ -143,7 +151,8 @@ public sealed class SessionEventDistributor : IAsyncDisposable
|
||||
replayRetentionSeconds: 0,
|
||||
logger,
|
||||
TimeProvider.System,
|
||||
overflowHandler)
|
||||
overflowHandler,
|
||||
singleSubscriberMode)
|
||||
{
|
||||
}
|
||||
|
||||
@@ -181,6 +190,17 @@ public sealed class SessionEventDistributor : IAsyncDisposable
|
||||
/// handler. When <see langword="null"/> (unit/skeleton use) the offending subscriber is
|
||||
/// still disconnected but no metric/fault side effect runs.
|
||||
/// </param>
|
||||
/// <param name="singleSubscriberMode">
|
||||
/// <see langword="true"/> when the owning session is in single-subscriber mode
|
||||
/// (<c>AllowMultipleEventSubscribers == false</c>). This gates the FailFast
|
||||
/// session-fault decision in <c>OnSubscriberOverflow</c>: an external subscriber that
|
||||
/// overflows reports <c>isOnlySubscriber == true</c> (legacy FailFast faults the
|
||||
/// session) ONLY in single-subscriber mode. In multi-subscriber mode it is always
|
||||
/// <see langword="false"/>, so FailFast degrades to a per-subscriber disconnect and a
|
||||
/// transient registration race can never falsely fault a shared session (Task 8;
|
||||
/// resolves the Task 5 REVISIT race). Defaults to <see langword="true"/> so existing
|
||||
/// call sites and unit tests keep legacy single-subscriber FailFast behavior.
|
||||
/// </param>
|
||||
public SessionEventDistributor(
|
||||
string sessionId,
|
||||
Func<CancellationToken, IAsyncEnumerable<MxEvent>> eventSourceFactory,
|
||||
@@ -189,7 +209,8 @@ public sealed class SessionEventDistributor : IAsyncDisposable
|
||||
double replayRetentionSeconds,
|
||||
ILogger<SessionEventDistributor> logger,
|
||||
TimeProvider timeProvider,
|
||||
SubscriberOverflowHandler? overflowHandler = null)
|
||||
SubscriberOverflowHandler? overflowHandler = null,
|
||||
bool singleSubscriberMode = true)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(sessionId);
|
||||
ArgumentNullException.ThrowIfNull(eventSourceFactory);
|
||||
@@ -202,6 +223,7 @@ public sealed class SessionEventDistributor : IAsyncDisposable
|
||||
_sessionId = sessionId;
|
||||
_eventSourceFactory = eventSourceFactory;
|
||||
_subscriberQueueCapacity = subscriberQueueCapacity;
|
||||
_singleSubscriberMode = singleSubscriberMode;
|
||||
_overflowHandler = overflowHandler;
|
||||
_shutdownTimeout = DefaultShutdownTimeout;
|
||||
_replayBufferCapacity = replayBufferCapacity;
|
||||
@@ -416,28 +438,25 @@ public sealed class SessionEventDistributor : IAsyncDisposable
|
||||
// slow consumer must not fault a session shared by other healthy subscribers.
|
||||
private void OnSubscriberOverflow(Subscriber subscriber, ulong workerSequence)
|
||||
{
|
||||
// Snapshot whether this is the sole subscriber BEFORE we unregister it. This drives
|
||||
// the FailFast-fault-session-vs-disconnect decision: FailFast only faults the session
|
||||
// when the overflowing subscriber is the sole subscriber.
|
||||
// Decide whether FailFast may fault the whole session for this overflow. This is the
|
||||
// "isOnlySubscriber" signal the legacy single-subscriber FailFast path keys on.
|
||||
//
|
||||
// This snapshot is safe in v1 because AllowMultipleEventSubscribers=false is enforced
|
||||
// by the validator and the single-subscriber guard in AttachEventSubscriber — a
|
||||
// concurrent second registration is impossible, so the false-FailFast race (two
|
||||
// subscribers, one overflows, Count reads as 1 after the other concurrently unregisters,
|
||||
// FailFast wrongly faults the session) cannot occur today.
|
||||
// Task 8 resolution of the Task 5/7 REVISIT race: gate this on the SESSION MODE
|
||||
// (_singleSubscriberMode), NOT on a live count snapshot. The old
|
||||
// `CountExternalSubscribers() == 1` snapshot raced once multi-subscriber became real —
|
||||
// a concurrent second registration/unregistration could make the count read as 1 with
|
||||
// two subscribers actually present, producing a false FailFast that faults a shared
|
||||
// session. The mode is fixed for the session's lifetime, so reading it is race-free:
|
||||
// - single-subscriber mode: at most one external subscriber can ever exist (the
|
||||
// AttachEventSubscriber guard enforces it), so an overflowing external subscriber
|
||||
// IS the sole subscriber — preserve the legacy FailFast session-fault behavior.
|
||||
// - multi-subscriber mode: never fault the shared session; FailFast degrades to a
|
||||
// per-subscriber disconnect so one slow consumer cannot punish healthy ones.
|
||||
//
|
||||
// REVISIT (Task 7/8): when multi-subscriber is enabled the guard is removed and the
|
||||
// race window opens — a concurrent second registration could cause Count to read as 1
|
||||
// here even with two subscribers, producing a false FailFast that faults a shared
|
||||
// session. Resolve before enabling multi-subscriber.
|
||||
//
|
||||
// Task 6: the gateway-owned internal dashboard subscriber is excluded from this
|
||||
// accounting. (a) An internal subscriber that overflows is NEVER the "only subscriber"
|
||||
// — a slow/broken dashboard must never fault the session, only disconnect its own
|
||||
// mirror. (b) Internal subscribers are excluded from the count, so a lone external
|
||||
// gRPC subscriber still reports isOnlySubscriber==true and preserves the legacy
|
||||
// FailFast session-fault behavior even while the dashboard mirror is attached.
|
||||
bool isOnlySubscriber = !subscriber.IsInternal && CountExternalSubscribers() == 1;
|
||||
// Task 6: the gateway-owned internal dashboard subscriber is excluded — an internal
|
||||
// subscriber that overflows is NEVER the "only subscriber", so a slow/broken dashboard
|
||||
// can only disconnect its own mirror and never fault the session.
|
||||
bool isOnlySubscriber = !subscriber.IsInternal && _singleSubscriberMode;
|
||||
|
||||
_logger.LogDebug(
|
||||
"Event distributor disconnecting subscriber {SubscriberId} in session {SessionId} after queue overflow (worker sequence {WorkerSequence}).",
|
||||
@@ -473,22 +492,6 @@ public sealed class SessionEventDistributor : IAsyncDisposable
|
||||
}
|
||||
}
|
||||
|
||||
// Counts external (non-internal) subscribers. Drives the isOnlySubscriber FailFast
|
||||
// decision so the gateway-owned internal dashboard subscriber never inflates the count.
|
||||
private int CountExternalSubscribers()
|
||||
{
|
||||
int count = 0;
|
||||
foreach (Subscriber subscriber in _subscribers.Values)
|
||||
{
|
||||
if (!subscriber.IsInternal)
|
||||
{
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
private void CompleteAllSubscribers(Exception? error)
|
||||
{
|
||||
foreach (Subscriber subscriber in _subscribers.Values)
|
||||
|
||||
@@ -38,13 +38,24 @@ namespace ZB.MOM.WW.MxGateway.Server.Sessions;
|
||||
/// EventsHub group regardless of whether a gRPC client is streaming. When null
|
||||
/// (unit tests that don't exercise the dashboard mirror) no mirror is started.
|
||||
/// </param>
|
||||
/// <param name="AllowMultipleEventSubscribers">
|
||||
/// The session's effective multi-subscriber mode (Task 8). Carried here so the session
|
||||
/// can pass it to its <see cref="SessionEventDistributor"/> at construction — the
|
||||
/// distributor is created at <c>MarkReady</c> (for the dashboard mirror) before any gRPC
|
||||
/// subscriber attaches, so the mode cannot be learned from a later
|
||||
/// <c>AttachEventSubscriber</c> call. The distributor gates its FailFast session-fault
|
||||
/// decision on this mode (single-subscriber only) instead of a live count snapshot,
|
||||
/// closing the Task 5 false-FailFast race. Defaults to <see langword="false"/>
|
||||
/// (single-subscriber) so existing call sites and unit tests are unchanged.
|
||||
/// </param>
|
||||
public sealed record SessionEventStreaming(
|
||||
MxAccessGrpcMapper Mapper,
|
||||
EventOptions EventOptions,
|
||||
ILogger<SessionEventDistributor> DistributorLogger,
|
||||
TimeProvider TimeProvider,
|
||||
GatewayMetrics Metrics,
|
||||
IDashboardEventBroadcaster? DashboardBroadcaster = null)
|
||||
IDashboardEventBroadcaster? DashboardBroadcaster = null,
|
||||
bool AllowMultipleEventSubscribers = false)
|
||||
{
|
||||
/// <summary>
|
||||
/// Defaults used when a session is constructed without explicit streaming
|
||||
|
||||
@@ -461,7 +461,8 @@ public sealed class SessionManager : ISessionManager
|
||||
_distributorLogger,
|
||||
_timeProvider,
|
||||
_metrics,
|
||||
_dashboardEventBroadcaster);
|
||||
_dashboardEventBroadcaster,
|
||||
_options.Sessions.AllowMultipleEventSubscribers);
|
||||
|
||||
return new GatewaySession(
|
||||
sessionId,
|
||||
|
||||
@@ -5,6 +5,7 @@ public enum SessionManagerErrorCode
|
||||
SessionNotFound,
|
||||
SessionNotReady,
|
||||
EventSubscriberAlreadyActive,
|
||||
EventSubscriberLimitReached,
|
||||
EventQueueOverflow,
|
||||
SessionLimitExceeded,
|
||||
OpenFailed,
|
||||
|
||||
Reference in New Issue
Block a user