feat(sessions): add bounded replay ring buffer to SessionEventDistributor

This commit is contained in:
Joseph Doherty
2026-06-15 12:42:15 -04:00
parent 7773bdebbd
commit e962737d2c
6 changed files with 375 additions and 3 deletions
@@ -11,4 +11,20 @@ public sealed class EventOptions
/// Gets the backpressure policy for event queue overflow.
/// </summary>
public EventBackpressurePolicy BackpressurePolicy { get; init; } = EventBackpressurePolicy.FailFast;
/// <summary>
/// Gets the maximum number of events retained in the per-session replay ring buffer
/// used to re-deliver events a returning subscriber missed (reconnect/reattach).
/// When the buffer exceeds this count the oldest retained events are evicted first.
/// A value of <c>0</c> disables replay retention entirely.
/// </summary>
public int ReplayBufferCapacity { get; init; } = 1024;
/// <summary>
/// Gets the maximum age, in seconds, of an event retained in the per-session replay
/// ring buffer. Entries older than this are evicted regardless of capacity. A value
/// of <c>0</c> disables age-based eviction (only <see cref="ReplayBufferCapacity"/>
/// bounds the buffer).
/// </summary>
public double ReplayRetentionSeconds { get; init; } = 300;
}
@@ -193,6 +193,16 @@ public sealed class GatewayOptionsValidator : OptionsValidatorBase<GatewayOption
{
builder.Add("MxGateway:Events:BackpressurePolicy must be a supported backpressure policy.");
}
// ReplayBufferCapacity and ReplayRetentionSeconds are bounds on the replay ring
// buffer; 0 is a valid value (disables that dimension), so only negatives fail.
AddIfNegative(
options.ReplayBufferCapacity,
"MxGateway:Events:ReplayBufferCapacity must be greater than or equal to zero.",
builder);
builder.RequireThat(
options.ReplayRetentionSeconds >= 0,
"MxGateway:Events:ReplayRetentionSeconds must be greater than or equal to zero.");
}
private static void ValidateDashboard(DashboardOptions options, ValidationBuilder builder)
@@ -55,10 +55,24 @@ public sealed class SessionEventDistributor : IAsyncDisposable
private readonly int _subscriberQueueCapacity;
private readonly TimeSpan _shutdownTimeout;
private readonly ILogger<SessionEventDistributor> _logger;
private readonly TimeProvider _timeProvider;
private readonly ConcurrentDictionary<long, Subscriber> _subscribers = new();
private readonly CancellationTokenSource _shutdownCts = new();
private readonly object _lifecycleLock = new();
// Replay ring buffer. Appended on the pump thread and queried from arbitrary
// threads via TryGetReplayFrom, so every access is under _replayLock. The deque
// keeps events in ascending WorkerSequence order (the pump fans in source order),
// so the oldest retained event is always at the front. Capacity == 0 disables
// retention; RetentionSeconds <= 0 disables age-based eviction.
private readonly int _replayBufferCapacity;
private readonly TimeSpan _replayRetention;
private readonly bool _ageEvictionEnabled;
private readonly LinkedList<ReplayEntry> _replayBuffer = new();
private readonly object _replayLock = new();
private bool _anyEventSeen;
private ulong _highestSequenceSeen;
private long _nextSubscriberId;
private Task? _pumpTask;
private bool _started;
@@ -78,22 +92,80 @@ public sealed class SessionEventDistributor : IAsyncDisposable
/// queue capacity shape used today.
/// </param>
/// <param name="logger">Logger for pump lifecycle diagnostics.</param>
/// <remarks>
/// This overload disables the replay ring buffer (capacity 0). Use the overload
/// taking replay parameters to retain events for reconnect/reattach replay.
/// </remarks>
public SessionEventDistributor(
string sessionId,
Func<CancellationToken, IAsyncEnumerable<MxEvent>> eventSourceFactory,
int subscriberQueueCapacity,
ILogger<SessionEventDistributor> logger)
: this(
sessionId,
eventSourceFactory,
subscriberQueueCapacity,
replayBufferCapacity: 0,
replayRetentionSeconds: 0,
logger,
TimeProvider.System)
{
}
/// <summary>
/// Initializes a per-session event distributor with a bounded replay ring buffer.
/// </summary>
/// <param name="sessionId">Owning session id, used only for logging context.</param>
/// <param name="eventSourceFactory">
/// Factory producing the session's event stream given a cancellation token.
/// The pump consumes this exactly once. See the type remarks for the seam Task 4
/// plugs into.
/// </param>
/// <param name="subscriberQueueCapacity">
/// Bounded capacity of each per-subscriber channel. Mirrors the gRPC event-stream
/// queue capacity shape used today.
/// </param>
/// <param name="replayBufferCapacity">
/// Maximum number of events retained for replay. The oldest retained event is
/// evicted once this count is exceeded. <c>0</c> disables retention entirely.
/// </param>
/// <param name="replayRetentionSeconds">
/// Maximum age, in seconds, of a retained event. Entries older than this are
/// evicted regardless of capacity. <c>0</c> (or less) disables age-based eviction.
/// </param>
/// <param name="logger">Logger for pump lifecycle diagnostics.</param>
/// <param name="timeProvider">
/// Clock used to timestamp and age-evict replay entries. Inject a fake to make
/// age-eviction deterministic in tests.
/// </param>
public SessionEventDistributor(
string sessionId,
Func<CancellationToken, IAsyncEnumerable<MxEvent>> eventSourceFactory,
int subscriberQueueCapacity,
int replayBufferCapacity,
double replayRetentionSeconds,
ILogger<SessionEventDistributor> logger,
TimeProvider timeProvider)
{
ArgumentException.ThrowIfNullOrWhiteSpace(sessionId);
ArgumentNullException.ThrowIfNull(eventSourceFactory);
ArgumentOutOfRangeException.ThrowIfLessThan(subscriberQueueCapacity, 1);
ArgumentOutOfRangeException.ThrowIfNegative(replayBufferCapacity);
ArgumentOutOfRangeException.ThrowIfNegative(replayRetentionSeconds);
ArgumentNullException.ThrowIfNull(logger);
ArgumentNullException.ThrowIfNull(timeProvider);
_sessionId = sessionId;
_eventSourceFactory = eventSourceFactory;
_subscriberQueueCapacity = subscriberQueueCapacity;
_shutdownTimeout = DefaultShutdownTimeout;
_replayBufferCapacity = replayBufferCapacity;
_ageEvictionEnabled = replayRetentionSeconds > 0;
_replayRetention = _ageEvictionEnabled
? TimeSpan.FromSeconds(replayRetentionSeconds)
: TimeSpan.Zero;
_logger = logger;
_timeProvider = timeProvider;
}
/// <summary>
@@ -232,6 +304,12 @@ public sealed class SessionEventDistributor : IAsyncDisposable
.WithCancellation(cancellationToken)
.ConfigureAwait(false))
{
// Retain for replay BEFORE fan-out so a reconnecting subscriber that
// queries between fan-out and its own read still sees this event. Order
// is preserved: the pump is the single appender and events arrive in
// source order.
AppendToReplayBuffer(mxEvent);
// Enumerating a ConcurrentDictionary's Values never throws on concurrent
// add/remove; a subscriber registered mid-iteration may miss this event,
// which matches "late subscribers see events after they register".
@@ -289,6 +367,132 @@ public sealed class SessionEventDistributor : IAsyncDisposable
}
}
/// <summary>
/// Returns the retained events with <see cref="MxEvent.WorkerSequence"/> strictly
/// greater than <paramref name="afterSequence"/>, in ascending sequence order, so a
/// reconnecting or reattaching subscriber can replay what it missed.
/// </summary>
/// <param name="afterSequence">
/// The last worker sequence the caller already observed. Only events newer than this
/// are returned.
/// </param>
/// <param name="events">
/// The retained events newer than <paramref name="afterSequence"/>, in order. Never
/// null; empty when nothing newer is retained.
/// </param>
/// <param name="gap">
/// <see langword="true"/> when events between <paramref name="afterSequence"/> and the
/// oldest retained event were already evicted (by capacity or age), meaning the caller
/// missed events that can no longer be replayed and must re-snapshot. When
/// <see langword="true"/>, whatever IS still retained is still returned via
/// <paramref name="events"/>.
/// </param>
/// <returns>
/// Always <see langword="true"/> — the out parameters fully describe the result. The
/// return value exists for a fluent call shape and future extension.
/// </returns>
/// <remarks>
/// <para>Gap semantics, by buffer state:</para>
/// <list type="bullet">
/// <item>
/// Buffer non-empty: <paramref name="gap"/> is <see langword="true"/> iff
/// <paramref name="afterSequence"/> is below the oldest retained sequence minus
/// one (i.e. at least one event newer than <paramref name="afterSequence"/> but
/// older than the oldest retained was evicted). When
/// <paramref name="afterSequence"/> equals or exceeds the newest retained
/// sequence the caller is fully caught up: empty list, no gap.
/// </item>
/// <item>
/// Buffer empty (retention disabled, nothing seen yet, or everything evicted):
/// empty list, and <paramref name="gap"/> is <see langword="true"/> iff
/// <paramref name="afterSequence"/> is below the highest sequence ever seen —
/// i.e. the caller is behind but nothing is retained to replay. If no event has
/// ever been seen, or the caller is already at/ahead of the highest seen, there
/// is nothing to miss: no gap.
/// </item>
/// </list>
/// </remarks>
public bool TryGetReplayFrom(ulong afterSequence, out IReadOnlyList<MxEvent> events, out bool gap)
{
lock (_replayLock)
{
EvictAged();
if (_replayBuffer.Count == 0)
{
events = [];
// Nothing retained. The caller missed events only if it is behind the
// highest sequence ever seen (and we have seen at least one event).
gap = _anyEventSeen && afterSequence < _highestSequenceSeen;
return true;
}
ulong oldestRetained = _replayBuffer.First!.Value.Event.WorkerSequence;
// A gap exists when at least one event newer than afterSequence was evicted,
// i.e. afterSequence sits below the oldest-retained-minus-one boundary.
gap = afterSequence + 1 < oldestRetained;
List<MxEvent> newer = [];
foreach (ReplayEntry entry in _replayBuffer)
{
if (entry.Event.WorkerSequence > afterSequence)
{
newer.Add(entry.Event);
}
}
events = newer;
return true;
}
}
private void AppendToReplayBuffer(MxEvent mxEvent)
{
lock (_replayLock)
{
_anyEventSeen = true;
if (mxEvent.WorkerSequence > _highestSequenceSeen)
{
_highestSequenceSeen = mxEvent.WorkerSequence;
}
// Capacity 0 disables retention: track the highest-seen sequence (so replay
// can still report a gap) but keep no events.
if (_replayBufferCapacity == 0)
{
return;
}
_replayBuffer.AddLast(new ReplayEntry(mxEvent, _timeProvider.GetUtcNow()));
// Capacity eviction: drop oldest until within bound.
while (_replayBuffer.Count > _replayBufferCapacity)
{
_replayBuffer.RemoveFirst();
}
EvictAged();
}
}
// Must be called under _replayLock. Drops entries older than the retention window.
private void EvictAged()
{
if (!_ageEvictionEnabled || _replayBuffer.Count == 0)
{
return;
}
DateTimeOffset cutoff = _timeProvider.GetUtcNow() - _replayRetention;
while (_replayBuffer.First is { } first && first.Value.RetainedAt < cutoff)
{
_replayBuffer.RemoveFirst();
}
}
private readonly record struct ReplayEntry(MxEvent Event, DateTimeOffset RetainedAt);
private sealed class Subscriber(long id, Channel<MxEvent> channel)
{
public long Id { get; } = id;
@@ -50,7 +50,9 @@
},
"Events": {
"QueueCapacity": 10000,
"BackpressurePolicy": "FailFast"
"BackpressurePolicy": "FailFast",
"ReplayBufferCapacity": 1024,
"ReplayRetentionSeconds": 300
},
"Dashboard": {
"Enabled": true,