feat(dashboard): mirror events via SessionEventDistributor subscriber (fixes dark feed without gRPC client)

This commit is contained in:
Joseph Doherty
2026-06-15 14:42:32 -04:00
parent 4f43733b96
commit 1ea08c3b10
9 changed files with 600 additions and 148 deletions
@@ -2,7 +2,6 @@ using System.Runtime.CompilerServices;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Server.Configuration;
using ZB.MOM.WW.MxGateway.Server.Dashboard.Hubs;
using ZB.MOM.WW.MxGateway.Server.Metrics;
using ZB.MOM.WW.MxGateway.Server.Sessions;
using ZB.MOM.WW.MxGateway.Server.Workers;
@@ -12,9 +11,7 @@ namespace ZB.MOM.WW.MxGateway.Server.Grpc;
public sealed class EventStreamService(
ISessionManager sessionManager,
IOptions<GatewayOptions> options,
GatewayMetrics metrics,
IDashboardEventBroadcaster dashboardEventBroadcaster,
ILogger<EventStreamService> logger) : IEventStreamService
GatewayMetrics metrics) : IEventStreamService
{
/// <summary>
/// Streams events from a session to the client asynchronously.
@@ -26,8 +23,19 @@ public sealed class EventStreamService(
/// <see cref="SessionEventDistributor"/> pump. The pump owns the single drain of
/// the worker event stream and the worker→public mapping (mirroring the former
/// <c>ProduceEventsAsync</c>); this loop is the per-subscriber boundary that
/// applies the per-RPC filter (<c>AfterWorkerSequence</c>), the dashboard mirror,
/// queue-depth metrics, and the backpressure/overflow policy.
/// applies the per-RPC filter (<c>AfterWorkerSequence</c>), queue-depth metrics,
/// and the backpressure/overflow policy.
/// </para>
/// <para>
/// Task 6 moved the dashboard mirror OFF this per-RPC loop. The dashboard is now a
/// first-class internal subscriber on the session's
/// <see cref="SessionEventDistributor"/> (see <c>GatewaySession.StartDashboardMirror</c>),
/// so it receives session events even when no gRPC client is streaming. This loop no
/// longer mirrors to the dashboard. One deliberate consequence: the dashboard now sees
/// RAW session events, not the per-gRPC-subscriber <c>AfterWorkerSequence</c>-filtered
/// view this loop applies — the dashboard is a separate LDAP-authenticated monitoring
/// view that should see the session's full event activity (per-session dashboard ACL is
/// the separate Task 18).
/// </para>
/// <para>
/// Overflow handling (Task 5): the distributor's per-subscriber channel is bounded
@@ -97,34 +105,12 @@ public sealed class EventStreamService(
// Per-RPC filter stays at the subscriber boundary: each request may resume
// from a different AfterWorkerSequence, so the shared pump fans raw events and
// this loop drops the ones at or below the caller's watermark. Filtered events
// are not mirrored to the dashboard, matching the pre-Task-4 ordering where
// the skip ran before the dashboard Publish.
// this loop drops the ones at or below the caller's watermark.
if (mxEvent.WorkerSequence <= afterWorkerSequence)
{
continue;
}
// Mirror the event to the dashboard EventsHub group for this session.
// Fire-and-forget — broadcast errors must not affect the source gRPC stream.
// Server-041: IDashboardEventBroadcaster documents Publish as never-throw,
// but we enforce that at the seam too so a future implementation that adds
// synchronous validation or a serializer hop cannot fault this loop and end
// the client's gRPC stream. (Task 6 will move this tap onto its own
// distributor subscriber; for Task 4 it coexists here, firing once per
// delivered event for the single subscriber exactly as before.)
try
{
dashboardEventBroadcaster.Publish(session.SessionId, mxEvent);
}
catch (Exception ex)
{
logger.LogDebug(
ex,
"Dashboard event mirror threw for session {SessionId}; continuing.",
session.SessionId);
}
// Queue-depth gauge tracks events the pump has fanned into this subscriber's
// channel but the client has not yet consumed — the same "buffered, not yet
// delivered" quantity the pre-Task-4 per-RPC channel reported. The bounded
@@ -1,6 +1,8 @@
using System.Runtime.CompilerServices;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Server.Configuration;
using ZB.MOM.WW.MxGateway.Server.Dashboard.Hubs;
using ZB.MOM.WW.MxGateway.Server.Grpc;
using ZB.MOM.WW.MxGateway.Server.Metrics;
using ZB.MOM.WW.MxGateway.Server.Workers;
@@ -21,6 +23,10 @@ public sealed class GatewaySession
private int _activeEventSubscriberCount;
private SessionEventDistributor? _eventDistributor;
private bool _eventDistributorStarted;
private bool _dashboardMirrorStarted;
private IEventSubscriberLease? _dashboardMirrorLease;
private Task? _dashboardMirrorTask;
private CancellationTokenSource? _dashboardMirrorCts;
private readonly Dictionary<(int ServerHandle, int ItemHandle), SessionItemRegistration> _items = [];
/// <summary>
@@ -350,9 +356,22 @@ public sealed class GatewaySession
/// <summary>
/// Transitions the session to the Ready state.
/// </summary>
/// <remarks>
/// On becoming Ready the session starts its internal dashboard mirror (Task 6) when a
/// dashboard broadcaster was supplied. The mirror registers an internal subscriber on
/// the distributor and starts the pump <em>before</em> any gRPC client attaches, so the
/// dashboard EventsHub receives session events even with no gRPC subscriber streaming —
/// fixing the "dark feed" where the dashboard only saw events while a gRPC client was
/// actively streaming. Registering the internal subscriber BEFORE
/// <see cref="SessionEventDistributor.StartAsync"/> also avoids the Task 4 hazard where
/// starting the pump at Ready with zero subscribers drained a fast-completing worker
/// stream into nothing and left a later subscriber hanging: there is now always a
/// subscriber (the dashboard one) registered before the pump starts.
/// </remarks>
public void MarkReady()
{
TransitionTo(SessionState.Ready);
StartDashboardMirror();
}
// Constructs and starts the distributor exactly once, registering the subscriber under
@@ -369,8 +388,24 @@ public sealed class GatewaySession
// the start so the very first subscriber sees the stream from its beginning.
private IEventSubscriberLease StartDistributorAndRegister()
{
SessionEventDistributor distributor;
bool startNow = false;
SessionEventDistributor distributor = EnsureDistributorCreated(out bool startNow);
// Register BEFORE starting the pump so a subscriber is present when the pump begins
// draining — no event is fanned to an empty subscriber set and then missed by this
// first subscriber. StartAsync only schedules the pump task; it never blocks.
IEventSubscriberLease lease = distributor.Register();
StartPumpIfRequested(distributor, startNow);
return lease;
}
// Constructs the distributor exactly once and reports whether THIS caller is the one
// that should start the pump (i.e. it observed the unstarted state and claimed the
// start). Both the construction and the started-flag flip happen under _syncRoot so two
// concurrent callers (e.g. MarkReady's dashboard mirror and a racing first
// AttachEventSubscriber) agree on a single distributor and a single start.
private SessionEventDistributor EnsureDistributorCreated(out bool startNow)
{
lock (_syncRoot)
{
if (_eventDistributor is null)
@@ -387,28 +422,133 @@ public sealed class GatewaySession
CreateOverflowHandler(eventOptions.BackpressurePolicy));
}
distributor = _eventDistributor;
startNow = false;
if (!_eventDistributorStarted)
{
_eventDistributorStarted = true;
startNow = true;
}
}
// Register BEFORE starting the pump so a subscriber is present when the pump begins
// draining — no event is fanned to an empty subscriber set and then missed by this
// first subscriber. StartAsync only schedules the pump task; it never blocks.
IEventSubscriberLease lease = distributor.Register();
if (startNow)
return _eventDistributor;
}
}
private static void StartPumpIfRequested(SessionEventDistributor distributor, bool startNow)
{
if (!startNow)
{
// StartAsync only schedules the pump via Task.Run and returns a completed task;
// it does not perform any async I/O itself. The sync-over-async call here is
// therefore safe and will not deadlock. Do not make StartAsync truly async
// (i.e., await real I/O before returning) without also changing this call site.
distributor.StartAsync(CancellationToken.None).GetAwaiter().GetResult();
return;
}
return lease;
// StartAsync only schedules the pump via Task.Run and returns a completed task;
// it does not perform any async I/O itself. The sync-over-async call here is
// therefore safe and will not deadlock. Do not make StartAsync truly async
// (i.e., await real I/O before returning) without also changing this call site.
distributor.StartAsync(CancellationToken.None).GetAwaiter().GetResult();
}
// Registers the gateway-owned internal dashboard subscriber on the distributor and starts
// a background loop that mirrors every fanned event to the dashboard broadcaster. Called
// once when the session becomes Ready (idempotent). The internal subscriber is registered
// BEFORE the pump starts (see StartDistributorAndRegister / EnsureDistributorCreated), so
// a subscriber is always present at pump start — the dashboard receives events with no
// gRPC subscriber attached, and the Task 4 "zero-subscriber drain into the void" hang
// cannot occur. No-op when no dashboard broadcaster was supplied (unit tests).
private void StartDashboardMirror()
{
IDashboardEventBroadcaster? broadcaster = _eventStreaming.DashboardBroadcaster;
if (broadcaster is null)
{
return;
}
SessionEventDistributor distributor;
CancellationToken loopToken;
lock (_syncRoot)
{
if (_dashboardMirrorStarted || _state is SessionState.Closing or SessionState.Closed or SessionState.Faulted)
{
return;
}
_dashboardMirrorStarted = true;
_dashboardMirrorCts = new CancellationTokenSource();
loopToken = _dashboardMirrorCts.Token;
}
// Create the distributor (claiming the start if we are first) and register the
// internal subscriber BEFORE starting the pump. isInternal: true keeps the dashboard
// subscriber out of the single-subscriber overflow accounting, so a slow/broken
// dashboard mirror only disconnects itself and never faults the session.
distributor = EnsureDistributorCreated(out bool startNow);
IEventSubscriberLease lease = distributor.Register(isInternal: true);
StartPumpIfRequested(distributor, startNow);
lock (_syncRoot)
{
_dashboardMirrorLease = lease;
}
_dashboardMirrorTask = Task.Run(
() => RunDashboardMirrorAsync(broadcaster, lease, loopToken),
CancellationToken.None);
}
// Reads the internal dashboard subscriber's channel and publishes each RAW fanned event
// to the dashboard broadcaster. The dashboard is a first-class distributor subscriber
// (Task 6), so it sees the session's full raw event activity — NOT the per-gRPC-subscriber
// AfterWorkerSequence filtering that EventStreamService applies at its own boundary. This
// is intentional: the dashboard is a separate LDAP-authenticated monitoring view (per-
// session dashboard ACL is the separate Task 18). Publish is best-effort / never-throw, so
// a slow or broken dashboard cannot fault the session or stall the pump; the bounded
// internal subscriber channel (Task 5 per-subscriber isolation) only disconnects THIS
// mirror on overflow, leaving the session and other subscribers untouched.
private async Task RunDashboardMirrorAsync(
IDashboardEventBroadcaster broadcaster,
IEventSubscriberLease lease,
CancellationToken cancellationToken)
{
try
{
await foreach (MxEvent mxEvent in lease.Reader
.ReadAllAsync(cancellationToken)
.ConfigureAwait(false))
{
try
{
broadcaster.Publish(SessionId, mxEvent);
}
catch (Exception exception)
{
// Publish is documented never-throw, but enforce it here too so a future
// implementation cannot fault the mirror loop. Logs identifiers only.
_eventStreaming.DistributorLogger.LogDebug(
exception,
"Dashboard event mirror threw for session {SessionId}; continuing.",
SessionId);
}
}
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
// Teardown path: the session is shutting down the mirror.
}
catch (SessionManagerException)
{
// The internal subscriber's channel overflowed and the distributor disconnected
// it with a terminal overflow fault. That disconnects only the dashboard mirror;
// the session, pump, and any gRPC subscriber are unaffected. Stop mirroring.
}
catch (Exception exception)
{
// Source-fault completion (worker event stream terminated abnormally) surfaces
// here. The session's own fault handling runs via the gRPC path / lifecycle; the
// mirror just stops. Logs identifiers only.
_eventStreaming.DistributorLogger.LogDebug(
exception,
"Dashboard event mirror loop ended for session {SessionId}.",
SessionId);
}
}
// Builds the per-subscriber backpressure handler the distributor invokes when a
@@ -1108,6 +1248,46 @@ public sealed class GatewaySession
{
}
// Stop the internal dashboard mirror first: cancel its loop, dispose its lease (which
// unregisters its internal distributor subscriber and completes its channel), and
// await the loop task. Done BEFORE disposing the distributor and worker client — like
// the distributor itself — so the mirror is no longer reading the pump when the pump
// and its source (the worker client) tear down.
IEventSubscriberLease? dashboardLease;
Task? dashboardTask;
CancellationTokenSource? dashboardCts;
lock (_syncRoot)
{
dashboardLease = _dashboardMirrorLease;
dashboardTask = _dashboardMirrorTask;
dashboardCts = _dashboardMirrorCts;
_dashboardMirrorLease = null;
_dashboardMirrorTask = null;
_dashboardMirrorCts = null;
}
if (dashboardCts is not null)
{
await dashboardCts.CancelAsync().ConfigureAwait(false);
}
dashboardLease?.Dispose();
if (dashboardTask is not null)
{
try
{
await dashboardTask.ConfigureAwait(false);
}
catch (Exception)
{
// The mirror loop swallows its own faults; any escape here must not block
// disposal. The loop has stopped, which is all teardown requires.
}
}
dashboardCts?.Dispose();
// Stop the event pump and complete every subscriber channel before tearing down the
// worker client (the pump's source). DisposeAsync is the single session teardown
// point (SessionManager.RemoveSessionAsync awaits it after close), so awaiting it
@@ -240,7 +240,19 @@ public sealed class SessionEventDistributor : IAsyncDisposable
/// subscriber and completes its channel without disturbing the pump or other
/// subscribers.
/// </summary>
public IEventSubscriberLease Register()
/// <param name="isInternal">
/// <see langword="true"/> for a gateway-owned internal subscriber (Task 6: the
/// session's dashboard mirror) that must NOT participate in the single-subscriber
/// overflow accounting. An internal subscriber is excluded from the
/// <c>isOnlySubscriber</c> count, so a lone external gRPC subscriber still reports
/// <c>isOnlySubscriber == true</c> (preserving legacy FailFast session-fault
/// behavior) even while the dashboard subscriber is attached; and an internal
/// subscriber that itself overflows always reports <c>isOnlySubscriber == false</c>,
/// so a slow/broken dashboard can never fault the session — it is merely
/// disconnected from the mirror. Defaults to <see langword="false"/> (external
/// subscriber) so every existing call site is unchanged.
/// </param>
public IEventSubscriberLease Register(bool isInternal = false)
{
// The pump is the single writer for this channel; readers are single-consumer
// (one gRPC stream / dashboard subscriber). Synchronous continuations are
@@ -265,7 +277,7 @@ public sealed class SessionEventDistributor : IAsyncDisposable
});
long id = Interlocked.Increment(ref _nextSubscriberId);
Subscriber subscriber = new(id, channel);
Subscriber subscriber = new(id, channel, isInternal);
// The disposed check AND the map add happen under the same lock with no await
// in between. DisposeAsync sets _disposed=true under this same lock before it
@@ -411,7 +423,14 @@ public sealed class SessionEventDistributor : IAsyncDisposable
// race window opens — a concurrent second registration could cause Count to read as 1
// here even with two subscribers, producing a false FailFast that faults a shared
// session. Resolve before enabling multi-subscriber.
bool isOnlySubscriber = _subscribers.Count == 1;
//
// Task 6: the gateway-owned internal dashboard subscriber is excluded from this
// accounting. (a) An internal subscriber that overflows is NEVER the "only subscriber"
// — a slow/broken dashboard must never fault the session, only disconnect its own
// mirror. (b) Internal subscribers are excluded from the count, so a lone external
// gRPC subscriber still reports isOnlySubscriber==true and preserves the legacy
// FailFast session-fault behavior even while the dashboard mirror is attached.
bool isOnlySubscriber = !subscriber.IsInternal && CountExternalSubscribers() == 1;
_logger.LogDebug(
"Event distributor disconnecting subscriber {SubscriberId} in session {SessionId} after queue overflow (worker sequence {WorkerSequence}).",
@@ -446,6 +465,22 @@ public sealed class SessionEventDistributor : IAsyncDisposable
}
}
// Counts external (non-internal) subscribers. Drives the isOnlySubscriber FailFast
// decision so the gateway-owned internal dashboard subscriber never inflates the count.
private int CountExternalSubscribers()
{
int count = 0;
foreach (Subscriber subscriber in _subscribers.Values)
{
if (!subscriber.IsInternal)
{
count++;
}
}
return count;
}
private void CompleteAllSubscribers(Exception? error)
{
foreach (Subscriber subscriber in _subscribers.Values)
@@ -593,11 +628,15 @@ public sealed class SessionEventDistributor : IAsyncDisposable
private readonly record struct ReplayEntry(MxEvent Event, DateTimeOffset RetainedAt);
private sealed class Subscriber(long id, Channel<MxEvent> channel)
private sealed class Subscriber(long id, Channel<MxEvent> channel, bool isInternal)
{
public long Id { get; } = id;
public Channel<MxEvent> Channel { get; } = channel;
// True for the gateway-owned internal dashboard subscriber. Excluded from the
// single-subscriber overflow accounting so it cannot fault the session.
public bool IsInternal { get; } = isInternal;
}
private sealed class SubscriberLease(SessionEventDistributor distributor, Subscriber subscriber)
@@ -1,5 +1,6 @@
using Microsoft.Extensions.Logging.Abstractions;
using ZB.MOM.WW.MxGateway.Server.Configuration;
using ZB.MOM.WW.MxGateway.Server.Dashboard.Hubs;
using ZB.MOM.WW.MxGateway.Server.Grpc;
using ZB.MOM.WW.MxGateway.Server.Metrics;
@@ -30,17 +31,25 @@ namespace ZB.MOM.WW.MxGateway.Server.Sessions;
/// fault. Carrying it here keeps the distributor decoupled from the metrics type while
/// preserving the observability the pre-epic per-RPC overflow path emitted.
/// </param>
/// <param name="DashboardBroadcaster">
/// Sink the session's internal dashboard mirror loop (Task 6) publishes raw session
/// <c>MxEvent</c>s to. When non-null the session registers an internal distributor
/// subscriber on becoming Ready and mirrors every fanned event to the dashboard
/// EventsHub group regardless of whether a gRPC client is streaming. When null
/// (unit tests that don't exercise the dashboard mirror) no mirror is started.
/// </param>
public sealed record SessionEventStreaming(
MxAccessGrpcMapper Mapper,
EventOptions EventOptions,
ILogger<SessionEventDistributor> DistributorLogger,
TimeProvider TimeProvider,
GatewayMetrics Metrics)
GatewayMetrics Metrics,
IDashboardEventBroadcaster? DashboardBroadcaster = null)
{
/// <summary>
/// Defaults used when a session is constructed without explicit streaming
/// dependencies (unit tests). Uses a fresh mapper, default event options, a no-op
/// logger, the system clock, and a fresh metrics sink.
/// logger, the system clock, a fresh metrics sink, and no dashboard mirror.
/// </summary>
public static SessionEventStreaming Default { get; } = new(
new MxAccessGrpcMapper(),
@@ -27,6 +27,7 @@ public sealed class SessionManager : ISessionManager
private readonly SemaphoreSlim _sessionSlots;
private readonly Grpc.MxAccessGrpcMapper _eventMapper;
private readonly ILogger<SessionEventDistributor> _distributorLogger;
private readonly Dashboard.Hubs.IDashboardEventBroadcaster? _dashboardEventBroadcaster;
/// <summary>
/// Initializes a new instance of <see cref="SessionManager"/>.
@@ -39,6 +40,12 @@ public sealed class SessionManager : ISessionManager
/// <param name="logger">Logger.</param>
/// <param name="eventMapper">Mapper used by each session's event distributor to map worker events to public events.</param>
/// <param name="distributorLogger">Logger passed to each session's event distributor pump.</param>
/// <param name="dashboardEventBroadcaster">
/// Dashboard SignalR fan-out sink. Each session registers an internal distributor
/// subscriber (Task 6) that mirrors raw session events to this broadcaster, so the
/// dashboard receives events regardless of whether a gRPC client is streaming. Null in
/// unit tests that do not exercise the dashboard mirror.
/// </param>
public SessionManager(
ISessionRegistry registry,
ISessionWorkerClientFactory workerClientFactory,
@@ -47,7 +54,8 @@ public sealed class SessionManager : ISessionManager
TimeProvider? timeProvider = null,
ILogger<SessionManager>? logger = null,
Grpc.MxAccessGrpcMapper? eventMapper = null,
ILogger<SessionEventDistributor>? distributorLogger = null)
ILogger<SessionEventDistributor>? distributorLogger = null,
Dashboard.Hubs.IDashboardEventBroadcaster? dashboardEventBroadcaster = null)
{
_registry = registry ?? throw new ArgumentNullException(nameof(registry));
_workerClientFactory = workerClientFactory ?? throw new ArgumentNullException(nameof(workerClientFactory));
@@ -57,6 +65,7 @@ public sealed class SessionManager : ISessionManager
_logger = logger ?? NullLogger<SessionManager>.Instance;
_eventMapper = eventMapper ?? new Grpc.MxAccessGrpcMapper();
_distributorLogger = distributorLogger ?? NullLogger<SessionEventDistributor>.Instance;
_dashboardEventBroadcaster = dashboardEventBroadcaster;
_options = options.Value;
_sessionSlots = new SemaphoreSlim(_options.Sessions.MaxSessions, _options.Sessions.MaxSessions);
}
@@ -451,7 +460,8 @@ public sealed class SessionManager : ISessionManager
_options.Events,
_distributorLogger,
_timeProvider,
_metrics);
_metrics,
_dashboardEventBroadcaster);
return new GatewaySession(
sessionId,