Files
mxaccessgw/src/ZB.MOM.WW.MxGateway.Server/Sessions/SessionEventDistributor.cs
T
Joseph Doherty c2c518862f fix(sessions): replay-buffer gap edge cases, effective-config exposure, capacity-0 tests
#2: Replace afterSequence+1<oldestRetained with overflow-safe oldestRetained>0&&afterSequence<oldestRetained-1 to prevent ulong wrap at MaxValue falsely reporting gap=true.
#3: Add ReplayBufferCapacity and ReplayRetentionSeconds to EffectiveEventConfiguration and populate from EventOptions in GatewayConfigurationProvider.
#4: Add four new SessionEventDistributorTests covering capacity=0 gap/no-gap paths and the ulong.MaxValue boundary case.
#5: Update class-level <remarks> to describe the Task 3 replay ring buffer (capacity + age eviction, TryGetReplayFrom) rather than its absence.
#6: Add O(n)-is-acceptable comment at TryGetReplayFrom linear scan.
#8: Narrow no-replay 4-arg ctor to internal; InternalsVisibleTo already covers the test project.
2026-06-15 12:48:11 -04:00

531 lines
23 KiB
C#

using System.Collections.Concurrent;
using System.Threading.Channels;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.MxGateway.Server.Sessions;
/// <summary>
/// Per-session event pump and fan-out. A single background task drains the
/// session's event source <em>exactly once</em> and fans each event out to
/// every currently-registered subscriber's own bounded channel.
/// </summary>
/// <remarks>
/// <para>
/// Introduced by Task 2 of the Session Resilience epic; the bounded replay ring
/// buffer was added by Task 3. The class is NOT yet wired into
/// <c>GatewaySession</c> or <c>EventStreamService</c> (Task 4), has no
/// per-subscriber backpressure-isolation policy (Task 5), and does not remove
/// the single-subscriber guard (Tasks 7/8). The ring buffer supports capacity
/// eviction (oldest entry dropped when the count exceeds
/// <c>replayBufferCapacity</c>) and age eviction (entries older than
/// <c>replayRetentionSeconds</c> dropped on the next append or query), and is
/// queried via <see cref="TryGetReplayFrom"/> by reconnecting subscribers.
/// </para>
/// <para>
/// <b>Source seam.</b> The event source is injected as a
/// <see cref="Func{T, TResult}"/> producing an
/// <see cref="IAsyncEnumerable{T}"/> of already-mapped public
/// <see cref="MxEvent"/>s, given a <see cref="CancellationToken"/>. This is the
/// cleanest seam for Task 4: it can pass
/// <c>ct =&gt; session.ReadEventsAsync(ct).Select(mapper.MapEvent)</c> (or a
/// channel reader's <c>ReadAllAsync</c>), while unit tests pass a plain
/// channel reader's <c>ReadAllAsync</c> with no real session. The pump owns the
/// single consumption of this enumerable; fan-out happens on the public
/// <see cref="MxEvent"/> after mapping, mirroring today's
/// <c>EventStreamService.ProduceEventsAsync</c> ordering.
/// </para>
/// <para>
/// <b>Concurrency.</b> The subscriber set is a
/// <see cref="ConcurrentDictionary{TKey, TValue}"/> keyed by a monotonic id.
/// The pump fans out by enumerating the dictionary itself (a lock-free,
/// snapshot-free enumerator that never throws on concurrent add/remove), and
/// <see cref="Register"/> / lease disposal mutate it without any lock held
/// across an <c>await</c>. Each subscriber channel receives events from a single
/// writer — the pump — so per-channel writes never race. MXAccess parity:
/// events are fanned in the order received; the pump never reorders or
/// synthesizes events.
/// </para>
/// <para>
/// <b>Lifecycle.</b> When the source ends or faults, every registered subscriber's
/// channel is completed (with the fault, if any). A subscriber that registers
/// <em>after</em> that point receives a lease whose channel is already completed
/// with the same outcome, so no reader can wait forever on a stream that is over.
/// </para>
/// </remarks>
public sealed class SessionEventDistributor : IAsyncDisposable
{
    /// <summary>
    /// Bounded wait for the pump to stop during disposal. A source factory that
    /// ignores cancellation must not hang dispose forever; after this window the
    /// pump is abandoned and subscribers are completed anyway.
    /// </summary>
    private static readonly TimeSpan DefaultShutdownTimeout = TimeSpan.FromSeconds(5);

    private readonly string _sessionId;
    private readonly Func<CancellationToken, IAsyncEnumerable<MxEvent>> _eventSourceFactory;
    private readonly int _subscriberQueueCapacity;
    private readonly TimeSpan _shutdownTimeout;
    private readonly ILogger<SessionEventDistributor> _logger;
    private readonly TimeProvider _timeProvider;
    private readonly ConcurrentDictionary<long, Subscriber> _subscribers = new();
    private readonly CancellationTokenSource _shutdownCts = new();
    private readonly object _lifecycleLock = new();

    // Replay ring buffer. Appended on the pump thread and queried from arbitrary
    // threads via TryGetReplayFrom, so every access is under _replayLock. The deque
    // keeps events in ascending WorkerSequence order (the pump fans in source order),
    // so the oldest retained event is always at the front. Capacity == 0 disables
    // retention; RetentionSeconds <= 0 disables age-based eviction.
    private readonly int _replayBufferCapacity;
    private readonly TimeSpan _replayRetention;
    private readonly bool _ageEvictionEnabled;
    private readonly LinkedList<ReplayEntry> _replayBuffer = new();
    private readonly object _replayLock = new();
    private bool _anyEventSeen;
    private ulong _highestSequenceSeen;

    private long _nextSubscriberId;

    // Lifecycle state; every field below is read and written under _lifecycleLock.
    private Task? _pumpTask;
    private bool _started;
    private bool _disposed;

    // Set once the pump has observed the end of the source (normal end or fault),
    // so that Register can complete late subscribers instead of leaving them with
    // a channel nobody will ever write to or complete.
    private bool _sourceCompleted;
    private Exception? _sourceError;

    /// <summary>
    /// Initializes a per-session event distributor.
    /// </summary>
    /// <param name="sessionId">Owning session id, used only for logging context.</param>
    /// <param name="eventSourceFactory">
    /// Factory producing the session's event stream given a cancellation token.
    /// The pump consumes this exactly once. See the type remarks for the seam Task 4
    /// plugs into.
    /// </param>
    /// <param name="subscriberQueueCapacity">
    /// Bounded capacity of each per-subscriber channel. Mirrors the gRPC event-stream
    /// queue capacity shape used today.
    /// </param>
    /// <param name="logger">Logger for pump lifecycle diagnostics.</param>
    /// <remarks>
    /// This overload disables the replay ring buffer (capacity 0). Use the overload
    /// taking replay parameters to retain events for reconnect/reattach replay.
    /// Kept <c>internal</c> so production wiring (Task 4) cannot accidentally use
    /// the no-replay path; tests reach it via <c>InternalsVisibleTo</c>.
    /// </remarks>
    internal SessionEventDistributor(
        string sessionId,
        Func<CancellationToken, IAsyncEnumerable<MxEvent>> eventSourceFactory,
        int subscriberQueueCapacity,
        ILogger<SessionEventDistributor> logger)
        : this(
            sessionId,
            eventSourceFactory,
            subscriberQueueCapacity,
            replayBufferCapacity: 0,
            replayRetentionSeconds: 0,
            logger,
            TimeProvider.System)
    {
    }

    /// <summary>
    /// Initializes a per-session event distributor with a bounded replay ring buffer.
    /// </summary>
    /// <param name="sessionId">Owning session id, used only for logging context.</param>
    /// <param name="eventSourceFactory">
    /// Factory producing the session's event stream given a cancellation token.
    /// The pump consumes this exactly once. See the type remarks for the seam Task 4
    /// plugs into.
    /// </param>
    /// <param name="subscriberQueueCapacity">
    /// Bounded capacity of each per-subscriber channel. Mirrors the gRPC event-stream
    /// queue capacity shape used today. Must be at least 1.
    /// </param>
    /// <param name="replayBufferCapacity">
    /// Maximum number of events retained for replay. The oldest retained event is
    /// evicted once this count is exceeded. <c>0</c> disables retention entirely.
    /// </param>
    /// <param name="replayRetentionSeconds">
    /// Maximum age, in seconds, of a retained event. Entries older than this are
    /// evicted regardless of capacity. <c>0</c> disables age-based eviction.
    /// </param>
    /// <param name="logger">Logger for pump lifecycle diagnostics.</param>
    /// <param name="timeProvider">
    /// Clock used to timestamp and age-evict replay entries. Inject a fake to make
    /// age-eviction deterministic in tests.
    /// </param>
    /// <exception cref="ArgumentException">
    /// <paramref name="sessionId"/> is null, empty, or whitespace.
    /// </exception>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="eventSourceFactory"/>, <paramref name="logger"/>, or
    /// <paramref name="timeProvider"/> is null.
    /// </exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="subscriberQueueCapacity"/> is less than 1, or
    /// <paramref name="replayBufferCapacity"/> or
    /// <paramref name="replayRetentionSeconds"/> is negative.
    /// </exception>
    public SessionEventDistributor(
        string sessionId,
        Func<CancellationToken, IAsyncEnumerable<MxEvent>> eventSourceFactory,
        int subscriberQueueCapacity,
        int replayBufferCapacity,
        double replayRetentionSeconds,
        ILogger<SessionEventDistributor> logger,
        TimeProvider timeProvider)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(sessionId);
        ArgumentNullException.ThrowIfNull(eventSourceFactory);
        ArgumentOutOfRangeException.ThrowIfLessThan(subscriberQueueCapacity, 1);
        ArgumentOutOfRangeException.ThrowIfNegative(replayBufferCapacity);
        ArgumentOutOfRangeException.ThrowIfNegative(replayRetentionSeconds);
        ArgumentNullException.ThrowIfNull(logger);
        ArgumentNullException.ThrowIfNull(timeProvider);

        _sessionId = sessionId;
        _eventSourceFactory = eventSourceFactory;
        _subscriberQueueCapacity = subscriberQueueCapacity;
        _shutdownTimeout = DefaultShutdownTimeout;
        _replayBufferCapacity = replayBufferCapacity;
        _ageEvictionEnabled = replayRetentionSeconds > 0;
        _replayRetention = _ageEvictionEnabled
            ? TimeSpan.FromSeconds(replayRetentionSeconds)
            : TimeSpan.Zero;
        _logger = logger;
        _timeProvider = timeProvider;
    }

    /// <summary>
    /// Gets the count of currently-registered subscribers.
    /// </summary>
    public int SubscriberCount => _subscribers.Count;

    /// <summary>
    /// Starts the background pump. Idempotent — a second call is a no-op.
    /// </summary>
    /// <param name="cancellationToken">Token observed only while starting.</param>
    /// <returns>An already-completed task; the pump runs in the background.</returns>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="cancellationToken"/> was already cancelled.
    /// </exception>
    /// <exception cref="ObjectDisposedException">The distributor has been disposed.</exception>
    public Task StartAsync(CancellationToken cancellationToken)
    {
        cancellationToken.ThrowIfCancellationRequested();

        lock (_lifecycleLock)
        {
            ObjectDisposedException.ThrowIf(_disposed, this);
            if (_started)
            {
                return Task.CompletedTask;
            }

            _started = true;

            // Capture the token here, while the CTS is known to be alive. Reading
            // _shutdownCts.Token lazily inside the Task.Run delegate would throw
            // ObjectDisposedException if the delegate only got scheduled after a
            // timed-out DisposeAsync had already disposed the CTS.
            CancellationToken pumpToken = _shutdownCts.Token;
            _pumpTask = Task.Run(() => PumpAsync(pumpToken), CancellationToken.None);
        }

        return Task.CompletedTask;
    }

    /// <summary>
    /// Registers a new subscriber and returns its lease. The lease exposes the
    /// subscriber's <see cref="ChannelReader{T}"/> and, when disposed, unregisters the
    /// subscriber and completes its channel without disturbing the pump or other
    /// subscribers.
    /// </summary>
    /// <returns>
    /// The lease for the new subscriber. If the event source has already ended or
    /// faulted, the lease's channel is already completed with that same outcome.
    /// </returns>
    /// <exception cref="ObjectDisposedException">The distributor has been disposed.</exception>
    public IEventSubscriberLease Register()
    {
        // The pump is the single writer of events for this channel; readers are
        // single-consumer (one gRPC stream / dashboard subscriber). Synchronous
        // continuations are disabled so a slow reader can never stall the pump on
        // its completion.
        //
        // FullMode is Wait but the pump currently writes with TryWrite (drop-on-full):
        // these are deliberately opposite policies and only a placeholder. Task 5 owns
        // the overflow policy and will reconcile them by either switching the pump to
        // WriteAsync (true backpressure, honouring Wait) or changing this to a Drop mode.
        // Do not "fix" the mismatch here — leave the decision to Task 5.
        Channel<MxEvent> channel = Channel.CreateBounded<MxEvent>(
            new BoundedChannelOptions(_subscriberQueueCapacity)
            {
                SingleReader = true,
                SingleWriter = true,
                FullMode = BoundedChannelFullMode.Wait,
                AllowSynchronousContinuations = false,
            });

        long id = Interlocked.Increment(ref _nextSubscriberId);
        Subscriber subscriber = new(id, channel);

        // The disposed check, the map add, and the source-completed check all happen
        // under the same lock with no await in between.
        //  * DisposeAsync sets _disposed=true under this lock before it completes
        //    subscribers, so once disposal has begun no further subscriber is added.
        //  * The pump sets _sourceCompleted under this lock before it completes
        //    subscribers, so a registration either lands before that (and is seen by
        //    the pump's completion pass) or observes the flag and is completed here.
        // Either way no subscriber is left holding a channel that never completes.
        lock (_lifecycleLock)
        {
            ObjectDisposedException.ThrowIf(_disposed, this);
            _subscribers[id] = subscriber;
            if (_sourceCompleted)
            {
                channel.Writer.TryComplete(_sourceError);
            }
        }

        return new SubscriberLease(this, subscriber);
    }

    /// <summary>
    /// Stops the pump and completes all subscriber channels. Idempotent.
    /// </summary>
    /// <returns>A task that completes once subscribers have been completed.</returns>
    public async ValueTask DisposeAsync()
    {
        Task? pumpTask;
        lock (_lifecycleLock)
        {
            if (_disposed)
            {
                return;
            }

            _disposed = true;
            pumpTask = _pumpTask;
        }

        // Signal the pump to stop. It must not block on a non-reading subscriber:
        // it writes with non-blocking TryWrite, so cancellation tears it down promptly.
        await _shutdownCts.CancelAsync().ConfigureAwait(false);

        if (pumpTask is not null)
        {
            await AwaitPumpStopAsync(pumpTask).ConfigureAwait(false);
        }

        CompleteAllSubscribers(error: null);
        _shutdownCts.Dispose();
    }

    /// <summary>
    /// Waits, for at most the shutdown timeout, for the pump task to finish and logs
    /// how it ended. Never lets a pump timeout, cancellation, or fault escape.
    /// </summary>
    /// <param name="pumpTask">The running (or already finished) pump task.</param>
    private async Task AwaitPumpStopAsync(Task pumpTask)
    {
        try
        {
            // Bound the wait: a source factory that ignores cancellation would
            // otherwise hang dispose forever. WaitAsync releases its timer as soon as
            // the pump finishes, unlike an uncancelled Task.Delay raced via WhenAny.
            await pumpTask.WaitAsync(_shutdownTimeout).ConfigureAwait(false);
        }
        catch (TimeoutException) when (!pumpTask.IsCompleted)
        {
            // The pump is abandoned; the caller completes subscribers anyway.
            _logger.LogWarning(
                "Event distributor pump did not stop within {ShutdownTimeoutSeconds}s for session {SessionId}; completing subscribers and abandoning the pump.",
                _shutdownTimeout.TotalSeconds,
                _sessionId);
        }
        catch (OperationCanceledException)
        {
            // Expected outcome of cancelling the pump; nothing to report.
        }
        catch (Exception exception)
        {
            _logger.LogDebug(
                exception,
                "Event distributor pump faulted during shutdown for session {SessionId}.",
                _sessionId);
        }
    }

    /// <summary>
    /// The pump body: consumes the event source once, retains each event for replay,
    /// and fans it out to every registered subscriber with a non-blocking write.
    /// </summary>
    /// <param name="cancellationToken">Shutdown token, cancelled by <see cref="DisposeAsync"/>.</param>
    private async Task PumpAsync(CancellationToken cancellationToken)
    {
        try
        {
            await foreach (MxEvent mxEvent in _eventSourceFactory(cancellationToken)
                .WithCancellation(cancellationToken)
                .ConfigureAwait(false))
            {
                // Retain for replay BEFORE fan-out so a reconnecting subscriber that
                // queries between fan-out and its own read still sees this event. Order
                // is preserved: the pump is the single appender and events arrive in
                // source order.
                AppendToReplayBuffer(mxEvent);

                // Enumerate the dictionary itself rather than its Values property:
                // Values takes every bucket lock and copies all values into a new
                // collection on each access, which is wasted work once per event on
                // this hot path. The dictionary's own enumerator is lock-free and never
                // throws on concurrent add/remove; a subscriber registered mid-iteration
                // may miss this event, which matches "late subscribers see events after
                // they register".
                foreach (KeyValuePair<long, Subscriber> registration in _subscribers)
                {
                    Subscriber subscriber = registration.Value;

                    // TODO(Task 5): define overflow policy (per-subscriber isolation —
                    // drop / disconnect / fault that one subscriber). For the Task 2
                    // skeleton, a non-blocking TryWrite that silently drops on a full
                    // channel is the placeholder so one slow reader never stalls the pump.
                    if (!subscriber.Channel.Writer.TryWrite(mxEvent))
                    {
                        // Visibility only — Task 5 owns the actual drop/backpressure policy.
                        // Logs identifiers (worker sequence, subscriber id, session) only,
                        // never the event payload or tag values.
                        _logger.LogDebug(
                            "Event distributor dropped event (worker sequence {WorkerSequence}) for subscriber {SubscriberId} in session {SessionId}: channel full.",
                            mxEvent.WorkerSequence,
                            subscriber.Id,
                            _sessionId);
                    }
                }
            }

            MarkSourceCompleted(error: null);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Shutdown path: DisposeAsync completes subscribers.
        }
        catch (Exception exception)
        {
            // Unexpected source fault (not the shutdown-cancellation path above) — visible
            // by default so an event stream silently dying is not lost in Debug noise.
            _logger.LogError(
                exception,
                "Event distributor source faulted for session {SessionId}.",
                _sessionId);
            MarkSourceCompleted(exception);
        }
    }

    /// <summary>
    /// Records that the source has ended (optionally with a fault) and completes every
    /// registered subscriber with that outcome.
    /// </summary>
    /// <param name="error">The source fault, or <see langword="null"/> for a normal end.</param>
    private void MarkSourceCompleted(Exception? error)
    {
        // Publish the terminal state under the lifecycle lock BEFORE the completion
        // pass. Register adds to the map under the same lock, so every registration is
        // either already in the map when the pass below snapshots it, or sees the flag.
        lock (_lifecycleLock)
        {
            _sourceCompleted = true;
            _sourceError = error;
        }

        CompleteAllSubscribers(error);
    }

    /// <summary>
    /// Completes the channel of every currently-registered subscriber. Channels that
    /// are already completed are left as they are.
    /// </summary>
    /// <param name="error">Fault to complete with, or <see langword="null"/>.</param>
    private void CompleteAllSubscribers(Exception? error)
    {
        // Cold path: the Values snapshot is deliberate here, so the pass covers exactly
        // the subscribers registered up to this point.
        foreach (Subscriber subscriber in _subscribers.Values)
        {
            subscriber.Channel.Writer.TryComplete(error);
        }
    }

    /// <summary>
    /// Removes a subscriber from the fan-out set and completes its channel. A second
    /// call for the same subscriber does nothing.
    /// </summary>
    /// <param name="subscriber">The subscriber whose lease was disposed.</param>
    private void Unregister(Subscriber subscriber)
    {
        if (_subscribers.TryRemove(subscriber.Id, out _))
        {
            subscriber.Channel.Writer.TryComplete();
        }
    }

    /// <summary>
    /// Returns the retained events with <see cref="MxEvent.WorkerSequence"/> strictly
    /// greater than <paramref name="afterSequence"/>, in ascending sequence order, so a
    /// reconnecting or reattaching subscriber can replay what it missed.
    /// </summary>
    /// <param name="afterSequence">
    /// The last worker sequence the caller already observed. Only events newer than this
    /// are returned.
    /// </param>
    /// <param name="events">
    /// The retained events newer than <paramref name="afterSequence"/>, in order. Never
    /// null; empty when nothing newer is retained.
    /// </param>
    /// <param name="gap">
    /// <see langword="true"/> when events between <paramref name="afterSequence"/> and the
    /// oldest retained event were already evicted (by capacity or age), meaning the caller
    /// missed events that can no longer be replayed and must re-snapshot. When
    /// <see langword="true"/>, whatever IS still retained is still returned via
    /// <paramref name="events"/>.
    /// </param>
    /// <returns>
    /// Always <see langword="true"/> — the out parameters fully describe the result. The
    /// return value exists for a fluent call shape and future extension.
    /// </returns>
    /// <remarks>
    /// <para>Gap semantics, by buffer state:</para>
    /// <list type="bullet">
    /// <item>
    /// Buffer non-empty: <paramref name="gap"/> is <see langword="true"/> iff
    /// <paramref name="afterSequence"/> is below the oldest retained sequence minus
    /// one (i.e. at least one event newer than <paramref name="afterSequence"/> but
    /// older than the oldest retained was evicted). When
    /// <paramref name="afterSequence"/> equals or exceeds the newest retained
    /// sequence the caller is fully caught up: empty list, no gap.
    /// </item>
    /// <item>
    /// Buffer empty (retention disabled, nothing seen yet, or everything evicted):
    /// empty list, and <paramref name="gap"/> is <see langword="true"/> iff
    /// <paramref name="afterSequence"/> is below the highest sequence ever seen —
    /// i.e. the caller is behind but nothing is retained to replay. If no event has
    /// ever been seen, or the caller is already at/ahead of the highest seen, there
    /// is nothing to miss: no gap.
    /// </item>
    /// </list>
    /// </remarks>
    public bool TryGetReplayFrom(ulong afterSequence, out IReadOnlyList<MxEvent> events, out bool gap)
    {
        lock (_replayLock)
        {
            EvictAged();

            if (_replayBuffer.Count == 0)
            {
                events = [];

                // Nothing retained. The caller missed events only if it is behind the
                // highest sequence ever seen (and we have seen at least one event).
                gap = _anyEventSeen && afterSequence < _highestSequenceSeen;
                return true;
            }

            ulong oldestRetained = _replayBuffer.First!.Value.Event.WorkerSequence;

            // A gap exists when at least one event newer than afterSequence was evicted,
            // i.e. afterSequence sits below the oldest-retained-minus-one boundary.
            // Written as (oldestRetained > 0 && afterSequence < oldestRetained - 1) to
            // avoid wrapping when afterSequence == ulong.MaxValue (afterSequence + 1
            // would overflow to 0, falsely reporting a gap).
            gap = oldestRetained > 0 && afterSequence < oldestRetained - 1;

            // O(n) scan over the retained buffer — acceptable because TryGetReplayFrom
            // is only called on subscriber reconnect, never on the hot fan-out path.
            List<MxEvent> newer = [];
            foreach (ReplayEntry entry in _replayBuffer)
            {
                if (entry.Event.WorkerSequence > afterSequence)
                {
                    newer.Add(entry.Event);
                }
            }

            events = newer;
            return true;
        }
    }

    /// <summary>
    /// Records an event in the replay state: always tracks the highest sequence seen,
    /// and, when retention is enabled, appends the event and applies capacity and age
    /// eviction.
    /// </summary>
    /// <param name="mxEvent">The event about to be fanned out.</param>
    private void AppendToReplayBuffer(MxEvent mxEvent)
    {
        lock (_replayLock)
        {
            _anyEventSeen = true;
            if (mxEvent.WorkerSequence > _highestSequenceSeen)
            {
                _highestSequenceSeen = mxEvent.WorkerSequence;
            }

            // Capacity 0 disables retention: track the highest-seen sequence (so replay
            // can still report a gap) but keep no events.
            if (_replayBufferCapacity == 0)
            {
                return;
            }

            _replayBuffer.AddLast(new ReplayEntry(mxEvent, _timeProvider.GetUtcNow()));

            // Capacity eviction: drop oldest until within bound.
            while (_replayBuffer.Count > _replayBufferCapacity)
            {
                _replayBuffer.RemoveFirst();
            }

            EvictAged();
        }
    }

    /// <summary>
    /// Drops entries older than the retention window from the front of the buffer.
    /// Must be called under <c>_replayLock</c>.
    /// </summary>
    private void EvictAged()
    {
        if (!_ageEvictionEnabled || _replayBuffer.Count == 0)
        {
            return;
        }

        DateTimeOffset now = _timeProvider.GetUtcNow();

        // Compare each entry's age against the retention window instead of computing a
        // cutoff instant (now - retention). With a very large retention that cutoff
        // would fall before DateTimeOffset.MinValue and the subtraction would throw on
        // the pump thread, faulting the whole event stream. The difference of two
        // DateTimeOffset values always fits in a TimeSpan, so this form cannot throw.
        while (_replayBuffer.First is { } oldest && now - oldest.Value.RetainedAt > _replayRetention)
        {
            _replayBuffer.RemoveFirst();
        }
    }

    /// <summary>A retained event together with the instant it was appended.</summary>
    private readonly record struct ReplayEntry(MxEvent Event, DateTimeOffset RetainedAt);

    /// <summary>A registered subscriber: its id and the channel the pump writes to.</summary>
    private sealed class Subscriber(long id, Channel<MxEvent> channel)
    {
        public long Id { get; } = id;

        public Channel<MxEvent> Channel { get; } = channel;
    }

    /// <summary>
    /// Lease handed to a subscriber; disposing it unregisters the subscriber exactly once.
    /// </summary>
    private sealed class SubscriberLease(SessionEventDistributor distributor, Subscriber subscriber)
        : IEventSubscriberLease
    {
        private int _leaseDisposed;

        public ChannelReader<MxEvent> Reader => subscriber.Channel.Reader;

        public void Dispose()
        {
            // Atomic check-and-set so concurrent Dispose calls unregister at most once.
            if (Interlocked.Exchange(ref _leaseDisposed, 1) == 0)
            {
                distributor.Unregister(subscriber);
            }
        }
    }
}