fix(dashboard): close StartDashboardMirror/DisposeAsync race; internal-overflow test + metric label
(1) GatewaySession.StartDashboardMirror: publish _dashboardMirrorLease and _dashboardMirrorTask
atomically under one _syncRoot section; if the session is already Closing/Closed/Faulted,
dispose the just-created lease and return without starting the mirror task so nothing is orphaned.
(2) WaitUntilAsync test helper: catch OperationCanceledException and call Assert.Fail with the
timeout duration and predicate source text instead of letting the exception propagate raw.
(3) New SessionEventDistributorTests.InternalSubscriberOverflow_HandlerSeesIsOnlySubscriberFalse:
verifies CountExternalSubscribers excludes the internal subscriber, so isOnlySubscriber==false
even when the internal subscriber is the only registered subscriber.
(4) SubscriberOverflowHandler delegate gains isInternal parameter; overflow metric label is
"dashboard-mirror" for internal subscribers and "grpc-event-stream" for external ones.
(5) DashboardEventBroadcaster.Publish: wrap SendAsync Task acquisition in try/catch so a
synchronous throw cannot escape the never-throw Publish interface contract.
This commit is contained in:
@@ -24,9 +24,21 @@ public sealed class DashboardEventBroadcaster(
|
||||
return;
|
||||
}
|
||||
|
||||
Task send = hubContext.Clients
|
||||
.Group(EventsHub.GroupName(sessionId))
|
||||
.SendAsync(EventsHub.EventMessage, mxEvent);
|
||||
// Wrap the Task acquisition in a try/catch so a hypothetical synchronous throw
|
||||
// from SendAsync (e.g. an implementation that throws before returning the Task)
|
||||
// cannot escape Publish. The interface contract is never-throw; fire-and-forget.
|
||||
Task send;
|
||||
try
|
||||
{
|
||||
send = hubContext.Clients
|
||||
.Group(EventsHub.GroupName(sessionId))
|
||||
.SendAsync(EventsHub.EventMessage, mxEvent);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
logger.LogDebug(ex, "Dashboard event mirror to session {SessionId} threw synchronously.", sessionId);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!send.IsCompletedSuccessfully)
|
||||
{
|
||||
|
||||
@@ -454,6 +454,14 @@ public sealed class GatewaySession
|
||||
// a subscriber is always present at pump start — the dashboard receives events with no
|
||||
// gRPC subscriber attached, and the Task 4 "zero-subscriber drain into the void" hang
|
||||
// cannot occur. No-op when no dashboard broadcaster was supplied (unit tests).
|
||||
//
|
||||
// Race-safety (Issue 1): _dashboardMirrorLease and _dashboardMirrorTask are published
|
||||
// atomically in one lock section (the second lock (_syncRoot) block below), and DisposeAsync reads/nulls them under
|
||||
// that same lock. After EnsureDistributorCreated/Register/StartPump (all outside _syncRoot
|
||||
// to avoid lock inversion with the distributor's own lifecycle lock), we re-enter
|
||||
// _syncRoot and check for concurrent disposal. If the session is already Closing/Closed/
|
||||
// Faulted at that point, we dispose the just-created lease immediately and do NOT start
|
||||
// the mirror task, so nothing is orphaned.
|
||||
private void StartDashboardMirror()
|
||||
{
|
||||
IDashboardEventBroadcaster? broadcaster = _eventStreaming.DashboardBroadcaster;
|
||||
@@ -462,7 +470,6 @@ public sealed class GatewaySession
|
||||
return;
|
||||
}
|
||||
|
||||
SessionEventDistributor distributor;
|
||||
CancellationToken loopToken;
|
||||
lock (_syncRoot)
|
||||
{
|
||||
@@ -480,18 +487,31 @@ public sealed class GatewaySession
|
||||
// internal subscriber BEFORE starting the pump. isInternal: true keeps the dashboard
|
||||
// subscriber out of the single-subscriber overflow accounting, so a slow/broken
|
||||
// dashboard mirror only disconnects itself and never faults the session.
|
||||
distributor = EnsureDistributorCreated(out bool startNow);
|
||||
// These three calls are OUTSIDE _syncRoot to avoid holding it across
|
||||
// EnsureDistributorCreated's own lock and StartAsync's Task.Run.
|
||||
SessionEventDistributor distributor = EnsureDistributorCreated(out bool startNow);
|
||||
IEventSubscriberLease lease = distributor.Register(isInternal: true);
|
||||
StartPumpIfRequested(distributor, startNow);
|
||||
|
||||
// Publish BOTH the lease and the task atomically under one lock section so
|
||||
// DisposeAsync always sees them in a consistent state: either both are set or
|
||||
// both are null. If the session already started disposal before we got here,
|
||||
// dispose the lease immediately instead of orphaning it.
|
||||
lock (_syncRoot)
|
||||
{
|
||||
_dashboardMirrorLease = lease;
|
||||
}
|
||||
if (_state is SessionState.Closing or SessionState.Closed or SessionState.Faulted)
|
||||
{
|
||||
// Disposal already ran (or is in progress) — discard the just-created
|
||||
// lease now so it is not orphaned. Do NOT launch the mirror task.
|
||||
lease.Dispose();
|
||||
return;
|
||||
}
|
||||
|
||||
_dashboardMirrorTask = Task.Run(
|
||||
() => RunDashboardMirrorAsync(broadcaster, lease, loopToken),
|
||||
CancellationToken.None);
|
||||
_dashboardMirrorLease = lease;
|
||||
_dashboardMirrorTask = Task.Run(
|
||||
() => RunDashboardMirrorAsync(broadcaster, lease, loopToken),
|
||||
CancellationToken.None);
|
||||
}
|
||||
}
|
||||
|
||||
// Reads the internal dashboard subscriber's channel and publishes each RAW fanned event
|
||||
@@ -556,7 +576,7 @@ public sealed class GatewaySession
|
||||
// offending subscriber with an EventQueueOverflow fault; this handler adds the
|
||||
// observable side effects, preserving exactly what the pre-epic per-RPC overflow path
|
||||
// emitted:
|
||||
// - always record the queue-overflow metric;
|
||||
// - always record the queue-overflow metric, labeled by subscriber kind;
|
||||
// - FailFast in the legacy single-subscriber case (isOnlySubscriber): fault the whole
|
||||
// session and record the fault metric, matching back-compat behavior;
|
||||
// - FailFast with multiple subscribers, or DisconnectSubscriber in any case: do NOT
|
||||
@@ -564,16 +584,23 @@ public sealed class GatewaySession
|
||||
// whole remedy, so other subscribers and the pump are unaffected. Multi-subscriber
|
||||
// FailFast deliberately degrades to a disconnect because faulting a shared session on
|
||||
// one slow consumer would punish healthy subscribers.
|
||||
// The delegate now carries isInternal directly (Issue 4), so the metric label is chosen
|
||||
// without any heuristic: "dashboard-mirror" for internal, "grpc-event-stream" for external.
|
||||
private SubscriberOverflowHandler CreateOverflowHandler(EventBackpressurePolicy policy)
|
||||
{
|
||||
GatewayMetrics metrics = _eventStreaming.Metrics;
|
||||
return isOnlySubscriber =>
|
||||
string sessionId = SessionId;
|
||||
return (isOnlySubscriber, isInternal) =>
|
||||
{
|
||||
metrics.QueueOverflow("grpc-event-stream");
|
||||
// Label the overflow metric by subscriber kind. The distributor passes isInternal
|
||||
// directly, so no heuristic is needed to distinguish an internal overflow (the
|
||||
// gateway-owned dashboard mirror) from an external one (a gRPC streaming client).
|
||||
string label = isInternal ? "dashboard-mirror" : "grpc-event-stream";
|
||||
metrics.QueueOverflow(label);
|
||||
|
||||
if (policy == EventBackpressurePolicy.FailFast && isOnlySubscriber)
|
||||
{
|
||||
MarkFaulted($"Session {SessionId} event stream queue overflowed.");
|
||||
MarkFaulted($"Session {sessionId} event stream queue overflowed.");
|
||||
metrics.Fault(SessionManagerErrorCode.EventQueueOverflow.ToString());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -17,8 +17,15 @@ namespace ZB.MOM.WW.MxGateway.Server.Sessions;
|
||||
/// subscriber at the moment of overflow (legacy single-subscriber mode). FailFast faults
|
||||
/// the session only in this case; with multiple subscribers FailFast degrades to a
|
||||
/// per-subscriber disconnect so one slow consumer never faults a session shared by others.
|
||||
/// Always <see langword="false"/> for internal subscribers (the dashboard mirror) because
|
||||
/// <see cref="SessionEventDistributor"/> excludes them from the external-subscriber count.
|
||||
/// </param>
|
||||
public delegate void SubscriberOverflowHandler(bool isOnlySubscriber);
|
||||
/// <param name="isInternal">
|
||||
/// <see langword="true"/> when the overflowing subscriber is the gateway-owned internal
|
||||
/// dashboard mirror subscriber. The handler uses this to choose the correct metric label
|
||||
/// (<c>"dashboard-mirror"</c> vs <c>"grpc-event-stream"</c>).
|
||||
/// </param>
|
||||
public delegate void SubscriberOverflowHandler(bool isOnlySubscriber, bool isInternal);
|
||||
|
||||
/// <summary>
|
||||
/// Per-session event pump and fan-out. A single background task drains the
|
||||
@@ -440,9 +447,10 @@ public sealed class SessionEventDistributor : IAsyncDisposable
|
||||
|
||||
// Observability + session-fault decision. Errors here must not stall the pump or
|
||||
// leave the subscriber attached, so the disconnect below runs regardless.
|
||||
// Pass subscriber.IsInternal so the handler can choose the correct metric label.
|
||||
try
|
||||
{
|
||||
_overflowHandler?.Invoke(isOnlySubscriber);
|
||||
_overflowHandler?.Invoke(isOnlySubscriber, subscriber.IsInternal);
|
||||
}
|
||||
catch (Exception exception)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user