using System.Collections.Concurrent; using System.Threading.Channels; using ZB.MOM.WW.MxGateway.Contracts.Proto; namespace ZB.MOM.WW.MxGateway.Server.Sessions; /// /// Per-session event pump and fan-out. A single background task drains the /// session's event source exactly once and fans each event out to /// every currently-registered subscriber's own bounded channel. /// /// /// /// Introduced by Task 2 of the Session Resilience epic; the bounded replay ring /// buffer was added by Task 3. The class is NOT yet wired into /// GatewaySession or EventStreamService (Task 4), has no /// per-subscriber backpressure-isolation policy (Task 5), and does not remove /// the single-subscriber guard (Tasks 7/8). The ring buffer supports capacity /// eviction (oldest entry dropped when the count exceeds /// replayBufferCapacity) and age eviction (entries older than /// replayRetentionSeconds dropped on the next append or query), and is /// queried via by reconnecting subscribers. /// /// /// Source seam. The event source is injected as a /// producing an /// of already-mapped public /// s, given a . This is the /// cleanest seam for Task 4: it can pass /// ct => session.ReadEventsAsync(ct).Select(mapper.MapEvent) (or a /// channel reader's ReadAllAsync), while unit tests pass a plain /// channel reader's ReadAllAsync with no real session. The pump owns the /// single consumption of this enumerable; fan-out happens on the public /// after mapping, mirroring today's /// EventStreamService.ProduceEventsAsync ordering. /// /// /// Concurrency. The subscriber set is a /// keyed by a monotonic id. /// The pump iterates it with a snapshot-free enumerator (which never throws on /// concurrent add/remove), and / lease disposal mutate it /// without any lock held across an await. Each subscriber channel has a /// single writer — the pump — so per-channel writes never race. MXAccess parity: /// events are fanned in the order received; the pump never reorders or /// synthesizes events. /// /// public sealed class SessionEventDistributor : IAsyncDisposable { /// /// Bounded wait for the pump to stop during disposal. A source factory that /// ignores cancellation must not hang dispose forever; after this window the /// pump is abandoned and subscribers are completed anyway. /// private static readonly TimeSpan DefaultShutdownTimeout = TimeSpan.FromSeconds(5); private readonly string _sessionId; private readonly Func> _eventSourceFactory; private readonly int _subscriberQueueCapacity; private readonly TimeSpan _shutdownTimeout; private readonly ILogger _logger; private readonly TimeProvider _timeProvider; private readonly ConcurrentDictionary _subscribers = new(); private readonly CancellationTokenSource _shutdownCts = new(); private readonly object _lifecycleLock = new(); // Replay ring buffer. Appended on the pump thread and queried from arbitrary // threads via TryGetReplayFrom, so every access is under _replayLock. The deque // keeps events in ascending WorkerSequence order (the pump fans in source order), // so the oldest retained event is always at the front. Capacity == 0 disables // retention; RetentionSeconds <= 0 disables age-based eviction. private readonly int _replayBufferCapacity; private readonly TimeSpan _replayRetention; private readonly bool _ageEvictionEnabled; private readonly LinkedList _replayBuffer = new(); private readonly object _replayLock = new(); private bool _anyEventSeen; private ulong _highestSequenceSeen; private long _nextSubscriberId; private Task? _pumpTask; private bool _started; private bool _disposed; /// /// Initializes a per-session event distributor. /// /// Owning session id, used only for logging context. /// /// Factory producing the session's event stream given a cancellation token. /// The pump consumes this exactly once. See the type remarks for the seam Task 4 /// plugs into. /// /// /// Bounded capacity of each per-subscriber channel. Mirrors the gRPC event-stream /// queue capacity shape used today. /// /// Logger for pump lifecycle diagnostics. /// /// This overload disables the replay ring buffer (capacity 0). Use the overload /// taking replay parameters to retain events for reconnect/reattach replay. /// Kept internal so production wiring (Task 4) cannot accidentally use /// the no-replay path; tests reach it via InternalsVisibleTo. /// internal SessionEventDistributor( string sessionId, Func> eventSourceFactory, int subscriberQueueCapacity, ILogger logger) : this( sessionId, eventSourceFactory, subscriberQueueCapacity, replayBufferCapacity: 0, replayRetentionSeconds: 0, logger, TimeProvider.System) { } /// /// Initializes a per-session event distributor with a bounded replay ring buffer. /// /// Owning session id, used only for logging context. /// /// Factory producing the session's event stream given a cancellation token. /// The pump consumes this exactly once. See the type remarks for the seam Task 4 /// plugs into. /// /// /// Bounded capacity of each per-subscriber channel. Mirrors the gRPC event-stream /// queue capacity shape used today. /// /// /// Maximum number of events retained for replay. The oldest retained event is /// evicted once this count is exceeded. 0 disables retention entirely. /// /// /// Maximum age, in seconds, of a retained event. Entries older than this are /// evicted regardless of capacity. 0 (or less) disables age-based eviction. /// /// Logger for pump lifecycle diagnostics. /// /// Clock used to timestamp and age-evict replay entries. Inject a fake to make /// age-eviction deterministic in tests. /// public SessionEventDistributor( string sessionId, Func> eventSourceFactory, int subscriberQueueCapacity, int replayBufferCapacity, double replayRetentionSeconds, ILogger logger, TimeProvider timeProvider) { ArgumentException.ThrowIfNullOrWhiteSpace(sessionId); ArgumentNullException.ThrowIfNull(eventSourceFactory); ArgumentOutOfRangeException.ThrowIfLessThan(subscriberQueueCapacity, 1); ArgumentOutOfRangeException.ThrowIfNegative(replayBufferCapacity); ArgumentOutOfRangeException.ThrowIfNegative(replayRetentionSeconds); ArgumentNullException.ThrowIfNull(logger); ArgumentNullException.ThrowIfNull(timeProvider); _sessionId = sessionId; _eventSourceFactory = eventSourceFactory; _subscriberQueueCapacity = subscriberQueueCapacity; _shutdownTimeout = DefaultShutdownTimeout; _replayBufferCapacity = replayBufferCapacity; _ageEvictionEnabled = replayRetentionSeconds > 0; _replayRetention = _ageEvictionEnabled ? TimeSpan.FromSeconds(replayRetentionSeconds) : TimeSpan.Zero; _logger = logger; _timeProvider = timeProvider; } /// /// Gets the count of currently-registered subscribers. /// public int SubscriberCount => _subscribers.Count; /// /// Starts the background pump. Idempotent — a second call is a no-op. /// /// Token observed only while starting. public Task StartAsync(CancellationToken cancellationToken) { cancellationToken.ThrowIfCancellationRequested(); lock (_lifecycleLock) { ObjectDisposedException.ThrowIf(_disposed, this); if (_started) { return Task.CompletedTask; } _started = true; _pumpTask = Task.Run(() => PumpAsync(_shutdownCts.Token), CancellationToken.None); } return Task.CompletedTask; } /// /// Registers a new subscriber and returns its lease. The lease exposes the /// subscriber's and, when disposed, unregisters the /// subscriber and completes its channel without disturbing the pump or other /// subscribers. /// public IEventSubscriberLease Register() { // The pump is the single writer for this channel; readers are single-consumer // (one gRPC stream / dashboard subscriber). Synchronous continuations are // disabled so a slow reader can never stall the pump on its completion. // // FullMode is Wait but the pump currently writes with TryWrite (drop-on-full): // these are deliberately opposite policies and only a placeholder. Task 5 owns // the overflow policy and will reconcile them by either switching the pump to // WriteAsync (true backpressure, honouring Wait) or changing this to a Drop mode. // Do not "fix" the mismatch here — leave the decision to Task 5. Channel channel = Channel.CreateBounded( new BoundedChannelOptions(_subscriberQueueCapacity) { SingleReader = true, SingleWriter = true, FullMode = BoundedChannelFullMode.Wait, AllowSynchronousContinuations = false, }); long id = Interlocked.Increment(ref _nextSubscriberId); Subscriber subscriber = new(id, channel); // The disposed check AND the map add happen under the same lock with no await // in between. DisposeAsync sets _disposed=true under this same lock before it // calls CompleteAllSubscribers, so once disposal has begun no further subscriber // can be added — closing the Register-after-DisposeAsync window that would // otherwise leave a subscriber's channel never completed. lock (_lifecycleLock) { ObjectDisposedException.ThrowIf(_disposed, this); _subscribers[id] = subscriber; } return new SubscriberLease(this, subscriber); } /// /// Stops the pump and completes all subscriber channels. Idempotent. /// public async ValueTask DisposeAsync() { Task? pumpTask; lock (_lifecycleLock) { if (_disposed) { return; } _disposed = true; pumpTask = _pumpTask; } // Signal the pump to stop. It must not block on a non-reading subscriber: // it writes with non-blocking TryWrite, so cancellation tears it down promptly. await _shutdownCts.CancelAsync().ConfigureAwait(false); if (pumpTask is not null) { // Bound the wait: a source factory that ignores cancellation would otherwise // hang dispose forever. If the pump does not stop in time we log and proceed // to complete subscribers anyway; DisposeAsync must not throw on this path. Task completed = await Task.WhenAny(pumpTask, Task.Delay(_shutdownTimeout)).ConfigureAwait(false); if (!ReferenceEquals(completed, pumpTask)) { _logger.LogWarning( "Event distributor pump did not stop within {ShutdownTimeoutSeconds}s for session {SessionId}; completing subscribers and abandoning the pump.", _shutdownTimeout.TotalSeconds, _sessionId); } else { try { await pumpTask.ConfigureAwait(false); } catch (OperationCanceledException) { } catch (Exception exception) { _logger.LogDebug( exception, "Event distributor pump faulted during shutdown for session {SessionId}.", _sessionId); } } } CompleteAllSubscribers(error: null); _shutdownCts.Dispose(); } private async Task PumpAsync(CancellationToken cancellationToken) { try { await foreach (MxEvent mxEvent in _eventSourceFactory(cancellationToken) .WithCancellation(cancellationToken) .ConfigureAwait(false)) { // Retain for replay BEFORE fan-out so a reconnecting subscriber that // queries between fan-out and its own read still sees this event. Order // is preserved: the pump is the single appender and events arrive in // source order. AppendToReplayBuffer(mxEvent); // Enumerating a ConcurrentDictionary's Values never throws on concurrent // add/remove; a subscriber registered mid-iteration may miss this event, // which matches "late subscribers see events after they register". foreach (Subscriber subscriber in _subscribers.Values) { // TODO(Task 5): define overflow policy (per-subscriber isolation — // drop / disconnect / fault that one subscriber). For the Task 2 // skeleton, a non-blocking TryWrite that silently drops on a full // channel is the placeholder so one slow reader never stalls the pump. if (!subscriber.Channel.Writer.TryWrite(mxEvent)) { // Visibility only — Task 5 owns the actual drop/backpressure policy. // Logs identifiers (worker sequence, subscriber id, session) only, // never the event payload or tag values. _logger.LogDebug( "Event distributor dropped event (worker sequence {WorkerSequence}) for subscriber {SubscriberId} in session {SessionId}: channel full.", mxEvent.WorkerSequence, subscriber.Id, _sessionId); } } } CompleteAllSubscribers(error: null); } catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) { // Shutdown path: DisposeAsync completes subscribers. } catch (Exception exception) { // Unexpected source fault (not the shutdown-cancellation path above) — visible // by default so an event stream silently dying is not lost in Debug noise. _logger.LogError( exception, "Event distributor source faulted for session {SessionId}.", _sessionId); CompleteAllSubscribers(exception); } } private void CompleteAllSubscribers(Exception? error) { foreach (Subscriber subscriber in _subscribers.Values) { subscriber.Channel.Writer.TryComplete(error); } } private void Unregister(Subscriber subscriber) { if (_subscribers.TryRemove(subscriber.Id, out _)) { subscriber.Channel.Writer.TryComplete(); } } /// /// Returns the retained events with strictly /// greater than , in ascending sequence order, so a /// reconnecting or reattaching subscriber can replay what it missed. /// /// /// The last worker sequence the caller already observed. Only events newer than this /// are returned. /// /// /// The retained events newer than , in order. Never /// null; empty when nothing newer is retained. /// /// /// when events between and the /// oldest retained event were already evicted (by capacity or age), meaning the caller /// missed events that can no longer be replayed and must re-snapshot. When /// , whatever IS still retained is still returned via /// . /// /// /// Always — the out parameters fully describe the result. The /// return value exists for a fluent call shape and future extension. /// /// /// Gap semantics, by buffer state: /// /// /// Buffer non-empty: is iff /// is below the oldest retained sequence minus /// one (i.e. at least one event newer than but /// older than the oldest retained was evicted). When /// equals or exceeds the newest retained /// sequence the caller is fully caught up: empty list, no gap. /// /// /// Buffer empty (retention disabled, nothing seen yet, or everything evicted): /// empty list, and is iff /// is below the highest sequence ever seen — /// i.e. the caller is behind but nothing is retained to replay. If no event has /// ever been seen, or the caller is already at/ahead of the highest seen, there /// is nothing to miss: no gap. /// /// /// public bool TryGetReplayFrom(ulong afterSequence, out IReadOnlyList events, out bool gap) { lock (_replayLock) { EvictAged(); if (_replayBuffer.Count == 0) { events = []; // Nothing retained. The caller missed events only if it is behind the // highest sequence ever seen (and we have seen at least one event). gap = _anyEventSeen && afterSequence < _highestSequenceSeen; return true; } ulong oldestRetained = _replayBuffer.First!.Value.Event.WorkerSequence; // A gap exists when at least one event newer than afterSequence was evicted, // i.e. afterSequence sits below the oldest-retained-minus-one boundary. // Written as (oldestRetained > 0 && afterSequence < oldestRetained - 1) to // avoid wrapping when afterSequence == ulong.MaxValue (afterSequence + 1 // would overflow to 0, falsely reporting a gap). gap = oldestRetained > 0 && afterSequence < oldestRetained - 1; // O(n) scan over the retained buffer — acceptable because TryGetReplayFrom // is only called on subscriber reconnect, never on the hot fan-out path. List newer = []; foreach (ReplayEntry entry in _replayBuffer) { if (entry.Event.WorkerSequence > afterSequence) { newer.Add(entry.Event); } } events = newer; return true; } } private void AppendToReplayBuffer(MxEvent mxEvent) { lock (_replayLock) { _anyEventSeen = true; if (mxEvent.WorkerSequence > _highestSequenceSeen) { _highestSequenceSeen = mxEvent.WorkerSequence; } // Capacity 0 disables retention: track the highest-seen sequence (so replay // can still report a gap) but keep no events. if (_replayBufferCapacity == 0) { return; } _replayBuffer.AddLast(new ReplayEntry(mxEvent, _timeProvider.GetUtcNow())); // Capacity eviction: drop oldest until within bound. while (_replayBuffer.Count > _replayBufferCapacity) { _replayBuffer.RemoveFirst(); } EvictAged(); } } // Must be called under _replayLock. Drops entries older than the retention window. private void EvictAged() { if (!_ageEvictionEnabled || _replayBuffer.Count == 0) { return; } DateTimeOffset cutoff = _timeProvider.GetUtcNow() - _replayRetention; while (_replayBuffer.First is { } first && first.Value.RetainedAt < cutoff) { _replayBuffer.RemoveFirst(); } } private readonly record struct ReplayEntry(MxEvent Event, DateTimeOffset RetainedAt); private sealed class Subscriber(long id, Channel channel) { public long Id { get; } = id; public Channel Channel { get; } = channel; } private sealed class SubscriberLease(SessionEventDistributor distributor, Subscriber subscriber) : IEventSubscriberLease { private int _leaseDisposed; public ChannelReader Reader => subscriber.Channel.Reader; public void Dispose() { // Atomic check-and-set so concurrent Dispose calls unregister at most once. if (Interlocked.Exchange(ref _leaseDisposed, 1) == 0) { distributor.Unregister(subscriber); } } } }