feat(sessions): per-subscriber backpressure isolation in SessionEventDistributor

This commit is contained in:
Joseph Doherty
2026-06-15 13:39:25 -04:00
parent 61627fc5b0
commit 039111ca05
9 changed files with 308 additions and 66 deletions
@@ -2,6 +2,7 @@ using System.Runtime.CompilerServices;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Server.Configuration;
using ZB.MOM.WW.MxGateway.Server.Grpc;
using ZB.MOM.WW.MxGateway.Server.Metrics;
using ZB.MOM.WW.MxGateway.Server.Workers;
namespace ZB.MOM.WW.MxGateway.Server.Sessions;
@@ -382,7 +383,8 @@ public sealed class GatewaySession
eventOptions.ReplayBufferCapacity,
eventOptions.ReplayRetentionSeconds,
_eventStreaming.DistributorLogger,
_eventStreaming.TimeProvider);
_eventStreaming.TimeProvider,
CreateOverflowHandler(eventOptions.BackpressurePolicy));
}
distributor = _eventDistributor;
@@ -409,6 +411,34 @@ public sealed class GatewaySession
return lease;
}
// Creates the callback the distributor runs whenever one subscriber's bounded channel
// overflows. Disconnecting the offending subscriber with an EventQueueOverflow fault is
// the distributor's job and always happens; this callback only layers on the observable
// side effects, reproducing exactly what the pre-epic per-RPC overflow path emitted:
//   1. The queue-overflow metric is recorded on every overflow, whatever the policy.
//   2. FailFast + legacy single-subscriber case (isOnlySubscriber is true): the whole
//      session is faulted and the fault metric recorded, for back-compat.
//   3. Everything else (FailFast with several subscribers, or DisconnectSubscriber in
//      any case): the session is NOT faulted. The distributor dropping the one slow
//      subscriber is the entire remedy, so the remaining subscribers and the pump carry
//      on unaffected. Multi-subscriber FailFast intentionally degrades to a plain
//      disconnect, because faulting a shared session over one slow consumer would punish
//      the healthy subscribers.
private SubscriberOverflowHandler CreateOverflowHandler(EventBackpressurePolicy policy)
{
    GatewayMetrics gatewayMetrics = _eventStreaming.Metrics;

    // The policy is fixed for the lifetime of the handler, so decide once up front
    // whether a session-level fault is even possible.
    bool faultsSessionWhenAlone = policy == EventBackpressurePolicy.FailFast;

    return isOnlySubscriber =>
    {
        // Case 1: every overflow is counted.
        gatewayMetrics.QueueOverflow("grpc-event-stream");

        // Case 3: nothing beyond the metric; the distributor's disconnect suffices.
        if (!faultsSessionWhenAlone || !isOnlySubscriber)
        {
            return;
        }

        // Case 2: legacy single-subscriber FailFast faults the session, then records
        // the fault metric.
        MarkFaulted($"Session {SessionId} event stream queue overflowed.");
        gatewayMetrics.Fault(SessionManagerErrorCode.EventQueueOverflow.ToString());
    };
}
// The distributor's single event source. Drains the worker event stream once (the
// distributor guarantees a single consumer) and maps each frame to the public MxEvent,
// preserving worker order. Mirrors the former ProduceEventsAsync mapping exactly.