test(sessions): document overflow race safety + close backpressure coverage gaps

- Issue 1: document the isOnlySubscriber snapshot race-safety assumption in
  OnSubscriberOverflow; flags the Task 7/8 revisit point explicitly.
- Issue 2: pin StreamDisconnects==1 in the FailFast overflow test so a
  regression dropping the StreamDisconnected("Detached") finally call is caught.
- Issue 3: replace plain int/bool? reads in SlowSubscriberOverflow test with
  Volatile.Read/Write + Interlocked.Increment stores to close the C# memory
  model data race on overflowCalls and observedIsOnlySubscriber.
- Issue 4: add SlowSubscriberOverflow_WithMultipleSubscribers_... distributor
  test pinning that isOnlySubscriber==false disables the session-fault path;
  includes TODO(Task 8) note for the GatewaySession-level assertion.
- Issue 5: reword SubscriberOverflowHandler XML doc to make explicit that the
  handler must NOT complete the subscriber's channel; the distributor owns that.
This commit is contained in:
Joseph Doherty
2026-06-15 13:46:37 -04:00
parent 039111ca05
commit 4f43733b96
3 changed files with 102 additions and 11 deletions
@@ -6,11 +6,11 @@ namespace ZB.MOM.WW.MxGateway.Server.Sessions;
/// <summary>
/// Invoked by the pump (on the pump thread) when a subscriber's bounded channel is full
/// and the event cannot be written. The handler applies the per-subscriber backpressure
/// policy: it records the overflow metric and, in the legacy single-subscriber FailFast
/// case, faults the owning session. It does NOT complete the subscriber's channel — the
/// distributor always disconnects the offending subscriber with an overflow fault — so
/// the handler is purely observability plus the session-fault decision.
/// and the event cannot be written. The handler applies policy side-effects only:
/// it records the overflow metric and, in the legacy single-subscriber FailFast case,
/// faults the owning session. The handler MUST NOT complete the subscriber's channel —
/// the distributor performs the disconnect and channel-completion unconditionally,
/// regardless of what the handler does.
/// </summary>
/// <param name="isOnlySubscriber">
/// <see langword="true"/> when the overflowing subscriber is the sole registered
@@ -397,8 +397,20 @@ public sealed class SessionEventDistributor : IAsyncDisposable
// slow consumer must not fault a session shared by other healthy subscribers.
private void OnSubscriberOverflow(Subscriber subscriber, ulong workerSequence)
{
// Snapshot whether this is the sole subscriber BEFORE we unregister it. This is the
// legacy single-subscriber mode used by the single-subscriber FailFast back-compat path.
// Snapshot whether this is the sole subscriber BEFORE we unregister it. This drives
// the FailFast-fault-session-vs-disconnect decision: FailFast only faults the session
// when the overflowing subscriber is the sole subscriber.
//
// This snapshot is safe in v1 because AllowMultipleEventSubscribers=false is enforced
// by the validator and the single-subscriber guard in AttachEventSubscriber — a
// concurrent second registration is impossible, so the false-FailFast race (two
// subscribers, one overflows, Count reads as 1 after the other concurrently unregisters,
// FailFast wrongly faults the session) cannot occur today.
//
// REVISIT (Task 7/8): when multi-subscriber is enabled the guard is removed and the
// race window opens — a concurrent registration or unregistration of another subscriber
// could leave Count reading as 1 here while the session is in fact shared, producing a
// false FailFast that faults a shared session. Resolve before enabling multi-subscriber.
bool isOnlySubscriber = _subscribers.Count == 1;
_logger.LogDebug(
@@ -206,6 +206,9 @@ public sealed class EventStreamServiceTests
GatewayMetricsSnapshot snapshot = metrics.GetSnapshot();
Assert.Equal(1, snapshot.QueueOverflows);
Assert.Equal(1, snapshot.Faults);
// The finally block in StreamEventsAsync calls StreamDisconnected("Detached") on the
// overflow+fault path too; pin it here so a regression removing that call is caught.
Assert.Equal(1, snapshot.StreamDisconnects);
}
/// <summary>
@@ -336,7 +336,12 @@ public sealed class SessionEventDistributorTests
// while a second, healthy subscriber keeps receiving and the pump keeps pumping.
Channel<MxEvent> source = Channel.CreateUnbounded<MxEvent>();
int overflowCalls = 0;
bool? observedIsOnlySubscriber = null;
// Separate variables for the bool value and the "set" flag so both can go through
// Volatile.Read/Write; those APIs have no overload for bool? (Nullable<bool>).
// Interlocked.Increment on the pump thread is the store for overflowCalls;
// Volatile.Read/Write provide ordering for the observedIsOnlySubscriber pair.
int observedIsOnlySubscriberSet = 0;
bool observedIsOnlySubscriberValue = false;
await using SessionEventDistributor distributor = new(
"session-test",
ct => source.Reader.ReadAllAsync(ct),
@@ -348,7 +353,8 @@ public sealed class SessionEventDistributorTests
isOnlySubscriber =>
{
Interlocked.Increment(ref overflowCalls);
observedIsOnlySubscriber = isOnlySubscriber;
Volatile.Write(ref observedIsOnlySubscriberValue, isOnlySubscriber);
Volatile.Write(ref observedIsOnlySubscriberSet, 1);
});
await distributor.StartAsync(CancellationToken.None);
@@ -372,8 +378,11 @@ public sealed class SessionEventDistributorTests
Assert.Equal(SessionManagerErrorCode.EventQueueOverflow, fault.ErrorCode);
// Two subscribers were registered at overflow time, so isOnlySubscriber is false.
Assert.Equal(1, overflowCalls);
Assert.False(observedIsOnlySubscriber);
// Use Volatile.Read so the test-thread reads are ordered after the pump-thread
// writes, avoiding a data race under the C# memory model.
Assert.Equal(1, Volatile.Read(ref overflowCalls));
Assert.Equal(1, Volatile.Read(ref observedIsOnlySubscriberSet));
Assert.False(Volatile.Read(ref observedIsOnlySubscriberValue));
Assert.Equal(1, distributor.SubscriberCount);
// The pump is still running and the healthy subscriber still receives new events.
@@ -382,6 +391,73 @@ public sealed class SessionEventDistributorTests
Assert.Equal(11ul, afterOverflow.WorkerSequence);
}
[Fact]
public async Task SlowSubscriberOverflow_WithMultipleSubscribers_HandlerSeesIsOnlySubscriberFalse_OtherKeepsReceiving()
{
    // Pins, at distributor level, that a FailFast-style overflow handler sees
    // isOnlySubscriber == false when a second subscriber is registered, i.e. the
    // "fault the session" branch is not selected. The distributor is driven directly
    // (no GatewaySession): two subscribers are registered, the one that never reads
    // overflows, and the test checks that
    //   (a) the handler was invoked with isOnlySubscriber == false,
    //   (b) the reading subscriber keeps receiving, and
    //   (c) the pump delivers a further event after the overflow.
    //
    // TODO(Task 8): add a GatewaySession-level "session stays Ready" assertion once
    // multi-subscriber config is enabled by the Tasks 7/8 validator/guard change.
    Channel<MxEvent> upstream = Channel.CreateUnbounded<MxEvent>();

    // Written from the overflow handler, read from the test body; every access goes
    // through Volatile.Read/Write.
    bool sawSharedOverflow = false; // handler ran with isOnlySubscriber == false
    bool sawSoleOverflow = false;   // handler ran with isOnlySubscriber == true (FailFast would fault)

    await using var distributor = new SessionEventDistributor(
        "session-multi-sub",
        ct => upstream.Reader.ReadAllAsync(ct),
        subscriberQueueCapacity: 2,
        replayBufferCapacity: 0,
        replayRetentionSeconds: 0,
        NullLogger<SessionEventDistributor>.Instance,
        TimeProvider.System,
        isOnlySubscriber =>
        {
            if (isOnlySubscriber)
            {
                // Sole-subscriber branch: a FailFast handler would fault the session.
                // The test asserts this branch is never taken.
                Volatile.Write(ref sawSoleOverflow, true);
            }
            else
            {
                // Shared-session branch: FailFast degrades to disconnect-only.
                Volatile.Write(ref sawSharedOverflow, true);
            }
        });

    await distributor.StartAsync(CancellationToken.None);

    // This lease is never read in the loop below, so its capacity-2 queue fills up.
    using IEventSubscriberLease stalledLease = distributor.Register();

    // This lease is read once per published event.
    using IEventSubscriberLease activeLease = distributor.Register();

    // Publish ten events, consuming each one on the active lease before the next write.
    const ulong lastSequence = 10;
    ulong nextSequence = 1;
    while (nextSequence <= lastSequence)
    {
        upstream.Writer.TryWrite(Event(nextSequence));
        await ReadOneAsync(activeLease.Reader);
        nextSequence++;
    }

    // Draining the stalled lease must surface the overflow fault.
    SessionManagerException overflowFault = await Assert.ThrowsAsync<SessionManagerException>(
        () => DrainUntilFaultAsync(stalledLease.Reader));
    Assert.Equal(SessionManagerErrorCode.EventQueueOverflow, overflowFault.ErrorCode);

    // The handler observed the multi-subscriber case ...
    Assert.True(Volatile.Read(ref sawSharedOverflow));

    // ... and never the sole-subscriber case that would fault the session.
    Assert.False(Volatile.Read(ref sawSoleOverflow));

    // An event published after the overflow still reaches the active lease.
    upstream.Writer.TryWrite(Event(11));
    Assert.Equal(11ul, (await ReadOneAsync(activeLease.Reader)).WorkerSequence);
}
private static async Task DrainUntilFaultAsync(ChannelReader<MxEvent> reader)
{
// Drains any buffered events, then surfaces the channel's completion fault (if any)