using System.Collections.Concurrent;
using System.Threading.Channels;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.MxGateway.Server.Sessions;
/// <summary>
/// Per-session event pump and fan-out. A single background task drains the
/// session's event source exactly once and fans each event out to
/// every currently-registered subscriber's own bounded channel.
/// </summary>
/// <remarks>
/// <para>
/// Introduced by Task 2 of the Session Resilience epic; the bounded replay ring
/// buffer was added by Task 3. The class is NOT yet wired into
/// <c>GatewaySession</c> or <c>EventStreamService</c> (Task 4), has no
/// per-subscriber backpressure-isolation policy (Task 5), and does not remove
/// the single-subscriber guard (Tasks 7/8). The ring buffer supports capacity
/// eviction (oldest entry dropped when the count exceeds
/// <c>replayBufferCapacity</c>) and age eviction (entries older than
/// <c>replayRetentionSeconds</c> dropped on the next append or query), and is
/// queried via <see cref="TryGetReplayFrom"/> by reconnecting subscribers.
/// </para>
/// <para>
/// <b>Source seam.</b> The event source is injected as a
/// <see cref="Func{T, TResult}"/> producing an <see cref="IAsyncEnumerable{T}"/>
/// of already-mapped public <see cref="MxEvent"/>s, given a
/// <see cref="CancellationToken"/>. This is the cleanest seam for Task 4: it can pass
/// <c>ct => session.ReadEventsAsync(ct).Select(mapper.MapEvent)</c> (or a
/// channel reader's <c>ReadAllAsync</c>), while unit tests pass a plain
/// channel reader's <c>ReadAllAsync</c> with no real session. The pump owns the
/// single consumption of this enumerable; fan-out happens on the public
/// <see cref="MxEvent"/> after mapping, mirroring today's
/// <c>EventStreamService.ProduceEventsAsync</c> ordering.
/// </para>
/// <para>
/// <b>Concurrency.</b> The subscriber set is a
/// <see cref="ConcurrentDictionary{TKey, TValue}"/> keyed by a monotonic id.
/// The pump iterates it with a snapshot-free enumerator (which never throws on
/// concurrent add/remove), and <see cref="Register"/> / lease disposal mutate it
/// without any lock held across an await. While the pump is running it is the only
/// writer of events to a subscriber channel, so per-channel writes never race.
/// MXAccess parity: events are fanned in the order received; the pump never
/// reorders or synthesizes events.
/// </para>
/// <para>
/// <b>Stream end.</b> When the source ends or faults, every registered subscriber
/// channel is completed with that outcome. A subscriber registered after that point
/// (but before disposal) receives a channel that is completed immediately with the
/// same outcome, so its reader never waits on a stream that can no longer produce.
/// </para>
/// </remarks>
public sealed class SessionEventDistributor : IAsyncDisposable
{
    /// <summary>
    /// Bounded wait for the pump to stop during disposal. A source factory that
    /// ignores cancellation must not hang dispose forever; after this window the
    /// pump is abandoned and subscribers are completed anyway.
    /// </summary>
    private static readonly TimeSpan DefaultShutdownTimeout = TimeSpan.FromSeconds(5);

    private readonly string _sessionId;
    private readonly Func<CancellationToken, IAsyncEnumerable<MxEvent>> _eventSourceFactory;
    private readonly int _subscriberQueueCapacity;
    private readonly TimeSpan _shutdownTimeout;
    private readonly ILogger _logger;
    private readonly TimeProvider _timeProvider;
    private readonly ConcurrentDictionary<long, Subscriber> _subscribers = new();
    private readonly CancellationTokenSource _shutdownCts = new();
    private readonly object _lifecycleLock = new();

    // Replay ring buffer. Appended on the pump thread and queried from arbitrary
    // threads via TryGetReplayFrom, so every access is under _replayLock. The queue
    // keeps events in ascending WorkerSequence order (the pump fans in source order),
    // so the oldest retained event is always at the head. Capacity == 0 disables
    // retention; RetentionSeconds <= 0 disables age-based eviction.
    private readonly int _replayBufferCapacity;
    private readonly TimeSpan _replayRetention;
    private readonly bool _ageEvictionEnabled;
    private readonly Queue<ReplayEntry> _replayBuffer = new();
    private readonly object _replayLock = new();
    private bool _anyEventSeen;
    private ulong _highestSequenceSeen;

    private long _nextSubscriberId;
    private Task? _pumpTask;
    private bool _started;
    private bool _disposed;

    // Terminal state of the fan-out, guarded by _lifecycleLock. Set the first time
    // subscribers are completed (source ended, source faulted, or disposal) so that a
    // subscriber registered afterwards can be completed with the same outcome.
    private bool _subscribersCompleted;
    private Exception? _completionError;

    /// <summary>
    /// Initializes a per-session event distributor.
    /// </summary>
    /// <param name="sessionId">Owning session id, used only for logging context.</param>
    /// <param name="eventSourceFactory">
    /// Factory producing the session's event stream given a cancellation token.
    /// The pump consumes this exactly once. See the type remarks for the seam Task 4
    /// plugs into.
    /// </param>
    /// <param name="subscriberQueueCapacity">
    /// Bounded capacity of each per-subscriber channel. Mirrors the gRPC event-stream
    /// queue capacity shape used today.
    /// </param>
    /// <param name="logger">Logger for pump lifecycle diagnostics.</param>
    /// <remarks>
    /// This overload disables the replay ring buffer (capacity 0). Use the overload
    /// taking replay parameters to retain events for reconnect/reattach replay.
    /// Kept <c>internal</c> so production wiring (Task 4) cannot accidentally use
    /// the no-replay path; tests reach it via <c>InternalsVisibleTo</c>.
    /// </remarks>
    internal SessionEventDistributor(
        string sessionId,
        Func<CancellationToken, IAsyncEnumerable<MxEvent>> eventSourceFactory,
        int subscriberQueueCapacity,
        ILogger logger)
        : this(
            sessionId,
            eventSourceFactory,
            subscriberQueueCapacity,
            replayBufferCapacity: 0,
            replayRetentionSeconds: 0,
            logger,
            TimeProvider.System)
    {
    }

    /// <summary>
    /// Initializes a per-session event distributor with a bounded replay ring buffer.
    /// </summary>
    /// <param name="sessionId">Owning session id, used only for logging context.</param>
    /// <param name="eventSourceFactory">
    /// Factory producing the session's event stream given a cancellation token.
    /// The pump consumes this exactly once. See the type remarks for the seam Task 4
    /// plugs into.
    /// </param>
    /// <param name="subscriberQueueCapacity">
    /// Bounded capacity of each per-subscriber channel. Mirrors the gRPC event-stream
    /// queue capacity shape used today.
    /// </param>
    /// <param name="replayBufferCapacity">
    /// Maximum number of events retained for replay. The oldest retained event is
    /// evicted once this count is exceeded. 0 disables retention entirely.
    /// </param>
    /// <param name="replayRetentionSeconds">
    /// Maximum age, in seconds, of a retained event. Entries older than this are
    /// evicted regardless of capacity. 0 disables age-based eviction; negative values
    /// are rejected.
    /// </param>
    /// <param name="logger">Logger for pump lifecycle diagnostics.</param>
    /// <param name="timeProvider">
    /// Clock used to timestamp and age-evict replay entries. Inject a fake to make
    /// age-eviction deterministic in tests.
    /// </param>
    /// <exception cref="ArgumentException"><paramref name="sessionId"/> is empty or whitespace.</exception>
    /// <exception cref="ArgumentNullException">A reference argument is <see langword="null"/>.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="subscriberQueueCapacity"/> is below 1, or a replay parameter is negative.
    /// </exception>
    public SessionEventDistributor(
        string sessionId,
        Func<CancellationToken, IAsyncEnumerable<MxEvent>> eventSourceFactory,
        int subscriberQueueCapacity,
        int replayBufferCapacity,
        double replayRetentionSeconds,
        ILogger logger,
        TimeProvider timeProvider)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(sessionId);
        ArgumentNullException.ThrowIfNull(eventSourceFactory);
        ArgumentOutOfRangeException.ThrowIfLessThan(subscriberQueueCapacity, 1);
        ArgumentOutOfRangeException.ThrowIfNegative(replayBufferCapacity);
        ArgumentOutOfRangeException.ThrowIfNegative(replayRetentionSeconds);
        ArgumentNullException.ThrowIfNull(logger);
        ArgumentNullException.ThrowIfNull(timeProvider);

        _sessionId = sessionId;
        _eventSourceFactory = eventSourceFactory;
        _subscriberQueueCapacity = subscriberQueueCapacity;
        _shutdownTimeout = DefaultShutdownTimeout;
        _replayBufferCapacity = replayBufferCapacity;
        _ageEvictionEnabled = replayRetentionSeconds > 0;
        _replayRetention = _ageEvictionEnabled
            ? TimeSpan.FromSeconds(replayRetentionSeconds)
            : TimeSpan.Zero;
        _logger = logger;
        _timeProvider = timeProvider;
    }

    /// <summary>
    /// Gets the count of currently-registered subscribers.
    /// </summary>
    public int SubscriberCount => _subscribers.Count;

    /// <summary>
    /// Starts the background pump. Idempotent — a second call is a no-op.
    /// </summary>
    /// <param name="cancellationToken">Token observed only while starting.</param>
    /// <returns>A completed task; the pump itself runs in the background.</returns>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> is already cancelled.</exception>
    /// <exception cref="ObjectDisposedException">The distributor has been disposed.</exception>
    public Task StartAsync(CancellationToken cancellationToken)
    {
        cancellationToken.ThrowIfCancellationRequested();
        lock (_lifecycleLock)
        {
            ObjectDisposedException.ThrowIf(_disposed, this);
            if (_started)
            {
                return Task.CompletedTask;
            }

            _started = true;

            // The pump is governed by the distributor's own shutdown token, not the
            // caller's start token, so cancelling the start token later does not stop it.
            _pumpTask = Task.Run(() => PumpAsync(_shutdownCts.Token), CancellationToken.None);
        }

        return Task.CompletedTask;
    }

    /// <summary>
    /// Registers a new subscriber and returns its lease. The lease exposes the
    /// subscriber's channel reader and, when disposed, unregisters the
    /// subscriber and completes its channel without disturbing the pump or other
    /// subscribers.
    /// </summary>
    /// <returns>
    /// The lease for the new subscriber. If the event source has already ended or
    /// faulted, the lease's channel is completed immediately with that same outcome.
    /// </returns>
    /// <exception cref="ObjectDisposedException">The distributor has been disposed.</exception>
    public IEventSubscriberLease Register()
    {
        // The pump is the single writer for this channel; readers are single-consumer
        // (one gRPC stream / dashboard subscriber). Synchronous continuations are
        // disabled so a slow reader can never stall the pump on its completion.
        //
        // FullMode is Wait but the pump currently writes with TryWrite (drop-on-full):
        // these are deliberately opposite policies and only a placeholder. Task 5 owns
        // the overflow policy and will reconcile them by either switching the pump to
        // WriteAsync (true backpressure, honouring Wait) or changing this to a Drop mode.
        // Do not "fix" the mismatch here — leave the decision to Task 5.
        Channel<MxEvent> channel = Channel.CreateBounded<MxEvent>(
            new BoundedChannelOptions(_subscriberQueueCapacity)
            {
                SingleReader = true,
                SingleWriter = true,
                FullMode = BoundedChannelFullMode.Wait,
                AllowSynchronousContinuations = false,
            });

        long id = Interlocked.Increment(ref _nextSubscriberId);
        Subscriber subscriber = new(id, channel);

        bool streamAlreadyEnded;
        Exception? streamError;

        // The disposed check AND the map add happen under the same lock with no await
        // in between. DisposeAsync sets _disposed=true under this same lock before it
        // calls CompleteAllSubscribers, so once disposal has begun no further subscriber
        // can be added — closing the Register-after-DisposeAsync window that would
        // otherwise leave a subscriber's channel never completed.
        //
        // The terminal state is read under the same lock for the same reason:
        // CompleteAllSubscribers publishes it under this lock BEFORE enumerating the
        // map, so a subscriber is either already in the map when that enumeration
        // starts, or observes the terminal state here and completes itself below.
        lock (_lifecycleLock)
        {
            ObjectDisposedException.ThrowIf(_disposed, this);
            _subscribers[id] = subscriber;
            streamAlreadyEnded = _subscribersCompleted;
            streamError = _completionError;
        }

        if (streamAlreadyEnded)
        {
            // The source has already ended or faulted; without this the reader would
            // wait until DisposeAsync for a completion that the pump already issued.
            channel.Writer.TryComplete(streamError);
        }

        return new SubscriberLease(this, subscriber);
    }

    /// <summary>
    /// Stops the pump and completes all subscriber channels. Idempotent.
    /// </summary>
    /// <returns>A task that completes when shutdown has finished or the pump was abandoned.</returns>
    public async ValueTask DisposeAsync()
    {
        Task? pumpTask;
        lock (_lifecycleLock)
        {
            if (_disposed)
            {
                return;
            }

            _disposed = true;
            pumpTask = _pumpTask;
        }

        try
        {
            // Signal the pump to stop. It must not block on a non-reading subscriber:
            // it writes with non-blocking TryWrite, so cancellation tears it down promptly.
            await _shutdownCts.CancelAsync().ConfigureAwait(false);

            if (pumpTask is not null)
            {
                await WaitForPumpAsync(pumpTask).ConfigureAwait(false);
            }
        }
        finally
        {
            // Runs even if cancellation callbacks registered on the shutdown token
            // threw: subscribers must never be left with an open channel after dispose.
            CompleteAllSubscribers(error: null);
            _shutdownCts.Dispose();
        }
    }

    /// <summary>
    /// Waits, for at most the shutdown timeout, for the pump to stop and observes its
    /// outcome. Logs instead of throwing when the pump overruns the timeout or faulted.
    /// </summary>
    /// <param name="pumpTask">The running pump task.</param>
    private async Task WaitForPumpAsync(Task pumpTask)
    {
        // Bound the wait: a source factory that ignores cancellation would otherwise
        // hang dispose forever. If the pump does not stop in time we log and proceed
        // to complete subscribers anyway; DisposeAsync must not throw on this path.
        using CancellationTokenSource timeoutCts = new();
        Task timeout = Task.Delay(_shutdownTimeout, timeoutCts.Token);
        Task completed = await Task.WhenAny(pumpTask, timeout).ConfigureAwait(false);

        if (!ReferenceEquals(completed, pumpTask))
        {
            _logger.LogWarning(
                "Event distributor pump did not stop within {ShutdownTimeoutSeconds}s for session {SessionId}; completing subscribers and abandoning the pump.",
                _shutdownTimeout.TotalSeconds,
                _sessionId);
            return;
        }

        // The pump stopped in time: release the timeout timer now rather than leaving
        // it pending for the rest of the shutdown window.
        await timeoutCts.CancelAsync().ConfigureAwait(false);

        try
        {
            await pumpTask.ConfigureAwait(false);
        }
        catch (OperationCanceledException)
        {
            // Expected on shutdown; nothing to report.
        }
        catch (Exception exception)
        {
            _logger.LogDebug(
                exception,
                "Event distributor pump faulted during shutdown for session {SessionId}.",
                _sessionId);
        }
    }

    /// <summary>
    /// Pump body: consumes the event source once, retains each event for replay, and
    /// offers it to every registered subscriber. Completes subscribers when the source
    /// ends or faults; on shutdown cancellation it leaves completion to
    /// <see cref="DisposeAsync"/>.
    /// </summary>
    /// <param name="cancellationToken">The distributor's shutdown token.</param>
    private async Task PumpAsync(CancellationToken cancellationToken)
    {
        try
        {
            await foreach (MxEvent mxEvent in _eventSourceFactory(cancellationToken)
                .WithCancellation(cancellationToken)
                .ConfigureAwait(false))
            {
                // Retain for replay BEFORE fan-out so a reconnecting subscriber that
                // queries between fan-out and its own read still sees this event. Order
                // is preserved: the pump is the single appender and events arrive in
                // source order.
                AppendToReplayBuffer(mxEvent);

                // Enumerating a ConcurrentDictionary's Values never throws on concurrent
                // add/remove; a subscriber registered mid-iteration may miss this event,
                // which matches "late subscribers see events after they register".
                foreach (Subscriber subscriber in _subscribers.Values)
                {
                    // TODO(Task 5): define overflow policy (per-subscriber isolation —
                    // drop / disconnect / fault that one subscriber). For the Task 2
                    // skeleton, a non-blocking TryWrite that silently drops on a full
                    // channel is the placeholder so one slow reader never stalls the pump.
                    if (!subscriber.Channel.Writer.TryWrite(mxEvent))
                    {
                        // Visibility only — Task 5 owns the actual drop/backpressure policy.
                        // Logs identifiers (worker sequence, subscriber id, session) only,
                        // never the event payload or tag values.
                        _logger.LogDebug(
                            "Event distributor dropped event (worker sequence {WorkerSequence}) for subscriber {SubscriberId} in session {SessionId}: channel full.",
                            mxEvent.WorkerSequence,
                            subscriber.Id,
                            _sessionId);
                    }
                }
            }

            CompleteAllSubscribers(error: null);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Shutdown path: DisposeAsync completes subscribers.
        }
        catch (Exception exception)
        {
            // Unexpected source fault (not the shutdown-cancellation path above) — visible
            // by default so an event stream silently dying is not lost in Debug noise.
            _logger.LogError(
                exception,
                "Event distributor source faulted for session {SessionId}.",
                _sessionId);
            CompleteAllSubscribers(exception);
        }
    }

    /// <summary>
    /// Completes every registered subscriber channel and records the terminal outcome
    /// so subscribers registered afterwards are completed the same way.
    /// </summary>
    /// <param name="error">The fault to surface to readers, or <see langword="null"/> for a clean end.</param>
    private void CompleteAllSubscribers(Exception? error)
    {
        // Publish the terminal state BEFORE enumerating. Register adds to the map and
        // reads this state under the same lock, so no subscriber can slip between the
        // two. First outcome wins: a later dispose must not mask an earlier source fault.
        lock (_lifecycleLock)
        {
            if (!_subscribersCompleted)
            {
                _subscribersCompleted = true;
                _completionError = error;
            }
        }

        foreach (Subscriber subscriber in _subscribers.Values)
        {
            subscriber.Channel.Writer.TryComplete(error);
        }
    }

    /// <summary>
    /// Removes a subscriber from the fan-out set and completes its channel. Safe to
    /// call for a subscriber that is no longer registered.
    /// </summary>
    /// <param name="subscriber">The subscriber to remove.</param>
    private void Unregister(Subscriber subscriber)
    {
        if (_subscribers.TryRemove(subscriber.Id, out _))
        {
            subscriber.Channel.Writer.TryComplete();
        }
    }

    /// <summary>
    /// Returns the retained events with <c>WorkerSequence</c> strictly
    /// greater than <paramref name="afterSequence"/>, in ascending sequence order, so a
    /// reconnecting or reattaching subscriber can replay what it missed.
    /// </summary>
    /// <param name="afterSequence">
    /// The last worker sequence the caller already observed. Only events newer than this
    /// are returned.
    /// </param>
    /// <param name="events">
    /// The retained events newer than <paramref name="afterSequence"/>, in order. Never
    /// <see langword="null"/>; empty when nothing newer is retained.
    /// </param>
    /// <param name="gap">
    /// <see langword="true"/> when events between <paramref name="afterSequence"/> and the
    /// oldest retained event were already evicted (by capacity or age), meaning the caller
    /// missed events that can no longer be replayed and must re-snapshot. When
    /// <see langword="true"/>, whatever IS still retained is still returned via
    /// <paramref name="events"/>.
    /// </param>
    /// <returns>
    /// Always <see langword="true"/> — the out parameters fully describe the result. The
    /// return value exists for a fluent call shape and future extension.
    /// </returns>
    /// <remarks>
    /// Gap semantics, by buffer state:
    /// <list type="bullet">
    /// <item>
    /// Buffer non-empty: <paramref name="gap"/> is <see langword="true"/> iff
    /// <paramref name="afterSequence"/> is below the oldest retained sequence minus
    /// one (i.e. at least one event newer than <paramref name="afterSequence"/> but
    /// older than the oldest retained was evicted). When
    /// <paramref name="afterSequence"/> equals or exceeds the newest retained
    /// sequence the caller is fully caught up: empty list, no gap.
    /// </item>
    /// <item>
    /// Buffer empty (retention disabled, nothing seen yet, or everything evicted):
    /// empty list, and <paramref name="gap"/> is <see langword="true"/> iff
    /// <paramref name="afterSequence"/> is below the highest sequence ever seen —
    /// i.e. the caller is behind but nothing is retained to replay. If no event has
    /// ever been seen, or the caller is already at/ahead of the highest seen, there
    /// is nothing to miss: no gap.
    /// </item>
    /// </list>
    /// </remarks>
    public bool TryGetReplayFrom(ulong afterSequence, out IReadOnlyList<MxEvent> events, out bool gap)
    {
        lock (_replayLock)
        {
            EvictAged();

            if (!_replayBuffer.TryPeek(out ReplayEntry oldest))
            {
                events = [];

                // Nothing retained. The caller missed events only if it is behind the
                // highest sequence ever seen (and we have seen at least one event).
                gap = _anyEventSeen && afterSequence < _highestSequenceSeen;
                return true;
            }

            ulong oldestRetained = oldest.Event.WorkerSequence;

            // A gap exists when at least one event newer than afterSequence was evicted,
            // i.e. afterSequence sits below the oldest-retained-minus-one boundary.
            // Written as (oldestRetained > 0 && afterSequence < oldestRetained - 1) to
            // avoid wrapping when afterSequence == ulong.MaxValue (afterSequence + 1
            // would overflow to 0, falsely reporting a gap).
            gap = oldestRetained > 0 && afterSequence < oldestRetained - 1;

            // O(n) scan over the retained buffer — acceptable because TryGetReplayFrom
            // is only called on subscriber reconnect, never on the hot fan-out path.
            List<MxEvent> newer = [];
            foreach (ReplayEntry entry in _replayBuffer)
            {
                if (entry.Event.WorkerSequence > afterSequence)
                {
                    newer.Add(entry.Event);
                }
            }

            events = newer;
            return true;
        }
    }

    /// <summary>
    /// Records an event's sequence and, when retention is enabled, appends the event
    /// to the replay buffer and applies capacity and age eviction.
    /// </summary>
    /// <param name="mxEvent">The event about to be fanned out.</param>
    private void AppendToReplayBuffer(MxEvent mxEvent)
    {
        lock (_replayLock)
        {
            _anyEventSeen = true;
            if (mxEvent.WorkerSequence > _highestSequenceSeen)
            {
                _highestSequenceSeen = mxEvent.WorkerSequence;
            }

            // Capacity 0 disables retention: track the highest-seen sequence (so replay
            // can still report a gap) but keep no events.
            if (_replayBufferCapacity == 0)
            {
                return;
            }

            _replayBuffer.Enqueue(new ReplayEntry(mxEvent, _timeProvider.GetUtcNow()));

            // Capacity eviction: drop oldest until within bound.
            while (_replayBuffer.Count > _replayBufferCapacity)
            {
                _replayBuffer.Dequeue();
            }

            EvictAged();
        }
    }

    // Must be called under _replayLock. Drops entries older than the retention window.
    private void EvictAged()
    {
        if (!_ageEvictionEnabled || _replayBuffer.Count == 0)
        {
            return;
        }

        DateTimeOffset cutoff = _timeProvider.GetUtcNow() - _replayRetention;

        // Entries are in arrival order, so the head is always the oldest; stop at the
        // first entry that is still inside the retention window.
        while (_replayBuffer.TryPeek(out ReplayEntry oldest) && oldest.RetainedAt < cutoff)
        {
            _replayBuffer.Dequeue();
        }
    }

    /// <summary>A retained event together with the time it entered the replay buffer.</summary>
    private readonly record struct ReplayEntry(MxEvent Event, DateTimeOffset RetainedAt);

    /// <summary>A registered subscriber: its id and the channel the pump writes to.</summary>
    private sealed class Subscriber(long id, Channel<MxEvent> channel)
    {
        public long Id { get; } = id;

        public Channel<MxEvent> Channel { get; } = channel;
    }

    /// <summary>
    /// Lease handed to a subscriber. Disposing it unregisters the subscriber and
    /// completes its channel; repeated or concurrent disposal is a no-op.
    /// </summary>
    private sealed class SubscriberLease(SessionEventDistributor distributor, Subscriber subscriber)
        : IEventSubscriberLease
    {
        private int _leaseDisposed;

        public ChannelReader<MxEvent> Reader => subscriber.Channel.Reader;

        public void Dispose()
        {
            // Atomic check-and-set so concurrent Dispose calls unregister at most once.
            if (Interlocked.Exchange(ref _leaseDisposed, 1) == 0)
            {
                distributor.Unregister(subscriber);
            }
        }
    }
}