feat(sessions): replay-on-reconnect with ReplayGap sentinel

This commit is contained in:
Joseph Doherty
2026-06-16 07:22:19 -04:00
parent 042f5e3d82
commit 36ab8d15f1
7 changed files with 736 additions and 29 deletions
@@ -300,6 +300,218 @@ public sealed class EventStreamServiceTests
Assert.Equal(1, metrics.GetSnapshot().Faults);
}
/// <summary>
/// Task 12: when AfterWorkerSequence lies inside the retained window, the resumed stream
/// delivers each newer retained event exactly once and in order, then continues with live
/// events, and never emits a ReplayGap sentinel.
/// </summary>
[Fact]
public async Task StreamEventsAsync_ResumeWithinRetainedWindow_ReplaysNewerThenLive_NoSentinel()
{
    // Arrange: static events 1..5 followed by an on-demand live channel.
    var liveChannel = System.Threading.Channels.Channel.CreateUnbounded<WorkerEvent>();
    FakeWorkerClient worker = new() { LiveEvents = liveChannel };
    ulong next = 1;
    while (next <= 5)
    {
        worker.Events.Add(CreateWorkerEvent(next, MxEventFamily.OnDataChange));
        next++;
    }

    GatewaySession session = CreateReadySession(worker);
    FakeSessionManager sessions = new(session);
    EventStreamService service = CreateService(sessions);

    // Push 1..5 through a throwaway subscriber so the replay ring holds them.
    await PrimeReplayAsync(service, session.SessionId, expectedCount: 5);

    // Act: resume after 2. The retained window [1..5] covers the watermark, so the
    // stream should replay 3,4,5 before switching to live.
    StreamEventsRequest request = CreateRequest(session.SessionId, afterWorkerSequence: 2);
    await using IAsyncEnumerator<MxEvent> resumed = service
        .StreamEventsAsync(request, CancellationToken.None)
        .GetAsyncEnumerator();
    List<MxEvent> replayed = [];
    while (replayed.Count < 3)
    {
        replayed.Add(await ReadNextAsync(resumed));
    }

    // Assert: exactly 3,4,5 and the first replayed item is a real event, not a sentinel.
    Assert.Equal(new ulong[] { 3, 4, 5 }, replayed.Select(e => e.WorkerSequence));
    Assert.Null(replayed[0].ReplayGap);

    // The next delivered item must be the live event, again without a sentinel.
    liveChannel.Writer.TryWrite(CreateWorkerEvent(6, MxEventFamily.OnDataChange));
    MxEvent afterReplay = await ReadNextAsync(resumed);
    Assert.Equal(6ul, afterReplay.WorkerSequence);
    Assert.Null(afterReplay.ReplayGap);
}
/// <summary>
/// Task 12: when AfterWorkerSequence is older than the oldest retained event, the stream
/// emits the ReplayGap sentinel FIRST (carrying the requested and oldest-available
/// sequences), then the retained tail, then live events.
/// </summary>
[Fact]
public async Task StreamEventsAsync_ResumeOlderThanOldestRetained_EmitsSentinelFirst_ThenTailThenLive()
{
    // Arrange: static events 1..5 followed by an on-demand live channel.
    var liveChannel = System.Threading.Channels.Channel.CreateUnbounded<WorkerEvent>();
    FakeWorkerClient worker = new() { LiveEvents = liveChannel };
    ulong next = 1;
    while (next <= 5)
    {
        worker.Events.Add(CreateWorkerEvent(next, MxEventFamily.OnDataChange));
        next++;
    }

    // With a replay capacity of 3 only 3,4,5 stay retained; 1 and 2 are evicted.
    GatewaySession session = CreateReadySession(worker, replayBufferCapacity: 3);
    FakeSessionManager sessions = new(session);
    EventStreamService service = CreateService(sessions);
    await PrimeReplayAsync(service, session.SessionId, expectedCount: 5);

    // Act: resume after 1. The next event the client needs (2) is older than the oldest
    // retained event (3) and cannot be recovered, so the sentinel must come first,
    // followed by the retained tail 3,4,5 and then live.
    StreamEventsRequest request = CreateRequest(session.SessionId, afterWorkerSequence: 1);
    await using IAsyncEnumerator<MxEvent> resume = service
        .StreamEventsAsync(request, CancellationToken.None)
        .GetAsyncEnumerator();

    // Assert: the sentinel describes the gap and carries no event family.
    MxEvent sentinel = await ReadNextAsync(resume);
    Assert.NotNull(sentinel.ReplayGap);
    Assert.Equal(1ul, sentinel.ReplayGap.RequestedAfterSequence);
    Assert.Equal(3ul, sentinel.ReplayGap.OldestAvailableSequence);
    Assert.Equal(MxEventFamily.Unspecified, sentinel.Family);
    Assert.Equal(session.SessionId, sentinel.SessionId);

    // Assert: the retained tail follows, as ordinary events.
    List<MxEvent> tail = [];
    while (tail.Count < 3)
    {
        tail.Add(await ReadNextAsync(resume));
    }

    Assert.Equal(new ulong[] { 3, 4, 5 }, tail.Select(e => e.WorkerSequence));
    Assert.Null(tail[0].ReplayGap);

    // Assert: after the tail the stream continues with the live event.
    liveChannel.Writer.TryWrite(CreateWorkerEvent(6, MxEventFamily.OnDataChange));
    MxEvent afterReplay = await ReadNextAsync(resume);
    Assert.Equal(6ul, afterReplay.WorkerSequence);
}
/// <summary>
/// Task 12: the sequence observed across the replay→live boundary is contiguous, with no
/// event delivered twice and none skipped.
/// </summary>
[Fact]
public async Task StreamEventsAsync_ResumeHandoff_IsContiguous_NoDuplicateNoSkip()
{
    // Arrange: static events 1..4 followed by an on-demand live channel.
    var liveChannel = System.Threading.Channels.Channel.CreateUnbounded<WorkerEvent>();
    FakeWorkerClient worker = new() { LiveEvents = liveChannel };
    ulong next = 1;
    while (next <= 4)
    {
        worker.Events.Add(CreateWorkerEvent(next, MxEventFamily.OnDataChange));
        next++;
    }

    GatewaySession session = CreateReadySession(worker);
    FakeSessionManager sessions = new(session);
    EventStreamService service = CreateService(sessions);
    await PrimeReplayAsync(service, session.SessionId, expectedCount: 4);

    // Act: resume after 2, so 3 and 4 come from replay and 5,6,7 from the live channel.
    StreamEventsRequest request = CreateRequest(session.SessionId, afterWorkerSequence: 2);
    await using IAsyncEnumerator<MxEvent> resume = service
        .StreamEventsAsync(request, CancellationToken.None)
        .GetAsyncEnumerator();
    List<ulong> observed = [];

    // The two replayed events (expected 3 then 4).
    for (int replayIndex = 0; replayIndex < 2; replayIndex++)
    {
        MxEvent replayed = await ReadNextAsync(resume);
        observed.Add(replayed.WorkerSequence);
    }

    // Each live event is published and then read before the next one is published.
    ulong liveSequence = 5;
    while (liveSequence <= 7)
    {
        liveChannel.Writer.TryWrite(CreateWorkerEvent(liveSequence, MxEventFamily.OnDataChange));
        MxEvent delivered = await ReadNextAsync(resume);
        observed.Add(delivered.WorkerSequence);
        liveSequence++;
    }

    // Assert: one unbroken run across the boundary.
    Assert.Equal(new ulong[] { 3, 4, 5, 6, 7 }, observed);
}
/// <summary>
/// Task 12: replayed events go through the same per-item filter as live events, so a
/// retained event at or below the requested watermark is never delivered.
/// </summary>
[Fact]
public async Task StreamEventsAsync_ResumeReplay_AppliesPerItemFilter_DropsAtOrBelowWatermark()
{
    // Arrange: static events 1..5 followed by an on-demand live channel.
    var liveChannel = System.Threading.Channels.Channel.CreateUnbounded<WorkerEvent>();
    FakeWorkerClient worker = new() { LiveEvents = liveChannel };
    ulong next = 1;
    while (next <= 5)
    {
        worker.Events.Add(CreateWorkerEvent(next, MxEventFamily.OnDataChange));
        next++;
    }

    GatewaySession session = CreateReadySession(worker);
    FakeSessionManager sessions = new(session);
    EventStreamService service = CreateService(sessions);
    await PrimeReplayAsync(service, session.SessionId, expectedCount: 5);

    // Act: resume after 3. Events 1,2,3 are still in the ring but sit at or below the
    // watermark, so only 4 and 5 may come back from the replay.
    StreamEventsRequest request = CreateRequest(session.SessionId, afterWorkerSequence: 3);
    await using IAsyncEnumerator<MxEvent> resume = service
        .StreamEventsAsync(request, CancellationToken.None)
        .GetAsyncEnumerator();
    MxEvent firstReplayed = await ReadNextAsync(resume);
    MxEvent secondReplayed = await ReadNextAsync(resume);

    // Assert: exactly 4 then 5, neither one a sentinel and neither one <= 3.
    Assert.Equal(4ul, firstReplayed.WorkerSequence);
    Assert.Equal(5ul, secondReplayed.WorkerSequence);
    Assert.Null(firstReplayed.ReplayGap);
    Assert.Null(secondReplayed.ReplayGap);

    // The very next delivery is the live 6, which shows nothing at or below the watermark
    // slipped in and that live delivery resumed strictly after the replay tail.
    liveChannel.Writer.TryWrite(CreateWorkerEvent(6, MxEventFamily.OnDataChange));
    MxEvent afterReplay = await ReadNextAsync(resume);
    Assert.Equal(6ul, afterReplay.WorkerSequence);
}
/// <summary>
/// Task 12: a request with AfterWorkerSequence == 0 is a fresh stream rather than a
/// resume, so the subscriber receives the events as before with no replay and no
/// ReplayGap sentinel.
/// </summary>
[Fact]
public async Task StreamEventsAsync_FreshStreamAfterSequenceZero_NoReplayNoSentinel()
{
    // Arrange: a worker that yields 1..3 and then completes its stream.
    FakeWorkerClient worker = new() { CompleteAfterConfiguredEvents = true };
    ulong next = 1;
    while (next <= 3)
    {
        worker.Events.Add(CreateWorkerEvent(next, MxEventFamily.OnDataChange));
        next++;
    }

    GatewaySession session = CreateReadySession(worker);
    FakeSessionManager sessions = new(session);
    EventStreamService service = CreateService(sessions);

    // Act: collect the whole stream using the default (zero) watermark.
    List<MxEvent> delivered = await CollectEventsAsync(service, session.SessionId);

    // Assert: every event arrives once, in order, and none of them is a sentinel.
    Assert.Equal(new ulong[] { 1, 2, 3 }, delivered.Select(item => item.WorkerSequence));
    Assert.DoesNotContain(delivered, item => item.ReplayGap is not null);
}
// Reads exactly `expectedCount` events through a throwaway subscriber (default request, no
// resume watermark) and then disposes that subscriber. The resume tests call this first so
// that the events have already flowed through the session before they attach with an
// AfterWorkerSequence and expect to find them retained for replay.
private static async Task PrimeReplayAsync(
    EventStreamService service,
    string sessionId,
    int expectedCount)
{
    StreamEventsRequest request = CreateRequest(sessionId);
    await using IAsyncEnumerator<MxEvent> primer = service
        .StreamEventsAsync(request, CancellationToken.None)
        .GetAsyncEnumerator();

    int remaining = expectedCount;
    while (remaining > 0)
    {
        // ReadNextAsync fails the test if the stream ends or the read times out.
        await ReadNextAsync(primer);
        remaining--;
    }
}
// Advances the enumerator by one item, waiting at most TestTimeout, asserts that an item
// was actually produced (the stream did not end), and returns that item.
private static async Task<MxEvent> ReadNextAsync(IAsyncEnumerator<MxEvent> enumerator)
{
    Task<bool> advance = enumerator.MoveNextAsync().AsTask();
    bool hasItem = await advance.WaitAsync(TestTimeout);
    Assert.True(hasItem);
    return enumerator.Current;
}
private static EventStreamService CreateService(
FakeSessionManager sessionManager,
GatewayMetrics? metrics = null,
@@ -334,11 +546,12 @@ public sealed class EventStreamServiceTests
return events;
}
private static StreamEventsRequest CreateRequest(string sessionId)
// Builds a StreamEventsRequest for the given session. The default watermark of 0 keeps
// existing callers that pass only a session id working unchanged.
private static StreamEventsRequest CreateRequest(string sessionId, ulong afterWorkerSequence = 0)
{
    StreamEventsRequest request = new()
    {
        SessionId = sessionId,
        AfterWorkerSequence = afterWorkerSequence,
    };
    return request;
}
@@ -347,7 +560,8 @@ public sealed class EventStreamServiceTests
string sessionId = "session-events",
int queueCapacity = 8,
GatewayMetrics? metrics = null,
EventBackpressurePolicy backpressurePolicy = EventBackpressurePolicy.FailFast)
EventBackpressurePolicy backpressurePolicy = EventBackpressurePolicy.FailFast,
int replayBufferCapacity = 1024)
{
// The per-subscriber overflow policy now lives in the session's
// SessionEventDistributor, so the session must share the same metrics sink and
@@ -373,6 +587,8 @@ public sealed class EventStreamServiceTests
{
QueueCapacity = queueCapacity,
BackpressurePolicy = backpressurePolicy,
ReplayBufferCapacity = replayBufferCapacity,
ReplayRetentionSeconds = 0,
},
NullLogger<SessionEventDistributor>.Instance,
TimeProvider.System,
@@ -513,6 +729,13 @@ public sealed class EventStreamServiceTests
/// <summary>Gets or sets whether to complete the event stream after configured events are yielded.</summary>
public bool CompleteAfterConfiguredEvents { get; set; }
/// <summary>
/// Optional live channel source. When set, the worker drains the static
/// <see cref="Events"/> first, then streams from this channel until it completes,
/// letting a test feed events on demand (e.g. to exercise replay→live handoff).
/// </summary>
/// <remarks>
/// Once this channel completes the worker's event stream ends, so
/// <see cref="CompleteAfterConfiguredEvents"/> is not consulted while a live channel is set.
/// Reads from the channel observe the stream's cancellation token.
/// </remarks>
public System.Threading.Channels.Channel<WorkerEvent>? LiveEvents { get; init; }
/// <summary>Gets or sets an optional exception to throw as a terminal event stream fault.</summary>
public Exception? TerminalException { get; init; }
@@ -558,6 +781,18 @@ public sealed class EventStreamServiceTests
throw TerminalException;
}
if (LiveEvents is not null)
{
await foreach (WorkerEvent liveEvent in LiveEvents.Reader
.ReadAllAsync(cancellationToken)
.ConfigureAwait(false))
{
yield return liveEvent;
}
yield break;
}
if (CompleteAfterConfiguredEvents)
{
yield break;
@@ -572,6 +572,110 @@ public sealed class SessionEventDistributorTests
"isOnlySubscriber must be true for a lone external subscriber in single-subscriber mode.");
}
// A resume watermark inside the retained window yields the newer retained events with no
// gap reported, after which live events reach the resumed subscriber's channel.
[Fact]
public async Task RegisterWithReplay_WithinRetainedWindow_ReturnsNewerEvents_NoGap_ThenLive()
{
    // Arrange
    Channel<MxEvent> upstream = Channel.CreateUnbounded<MxEvent>();
    await using SessionEventDistributor distributor = CreateDistributor(
        upstream.Reader,
        replayBufferCapacity: 10,
        replayRetentionSeconds: 0);
    await distributor.StartAsync(CancellationToken.None);

    // Each event is written and then read back through a primer subscriber, so by the time
    // the loop ends the pump has deterministically processed (and retained) 1..5.
    using IEventSubscriberLease primer = distributor.Register();
    ulong published = 0;
    while (published < 5)
    {
        published++;
        upstream.Writer.TryWrite(Event(published));
        await ReadOneAsync(primer.Reader);
    }

    // Act: resume after 2. The retained window [1..5] still covers that watermark.
    using IEventSubscriberLease resumed = distributor.RegisterWithReplay(
        2,
        out IReadOnlyList<MxEvent> replayed,
        out bool gapReported,
        out ulong oldestRetained,
        out ulong liveResumeAfter);

    // Assert: no gap, replay is 3..5, live delivery continues after 5, oldest retained is 1.
    Assert.False(gapReported);
    Assert.Equal(new ulong[] { 3, 4, 5 }, replayed.Select(item => item.WorkerSequence));
    Assert.Equal(5ul, liveResumeAfter);
    Assert.Equal(1ul, oldestRetained);

    // A later live event arrives on the resumed subscriber's own channel.
    upstream.Writer.TryWrite(Event(6));
    MxEvent liveEvent = await ReadOneAsync(resumed.Reader);
    Assert.Equal(6ul, liveEvent.WorkerSequence);
}
// A resume watermark older than the oldest retained event reports a gap together with the
// oldest sequence still available, and returns the retained tail as the replay.
[Fact]
public async Task RegisterWithReplay_BelowOldestRetained_ReportsGap_AndOldestAvailable()
{
    // Arrange
    Channel<MxEvent> upstream = Channel.CreateUnbounded<MxEvent>();
    await using SessionEventDistributor distributor = CreateDistributor(
        upstream.Reader,
        replayBufferCapacity: 3,
        replayRetentionSeconds: 0);
    await distributor.StartAsync(CancellationToken.None);

    // Write-then-read through a primer subscriber so 1..5 have all been pumped.
    using IEventSubscriberLease primer = distributor.Register();
    ulong published = 0;
    while (published < 5)
    {
        published++;
        upstream.Writer.TryWrite(Event(published));
        await ReadOneAsync(primer.Reader);
    }

    // Act: with capacity 3 only 3,4,5 remain retained (1 and 2 were evicted), so a resume
    // after 0 cannot be served completely.
    using IEventSubscriberLease resumed = distributor.RegisterWithReplay(
        0,
        out IReadOnlyList<MxEvent> replayed,
        out bool gapReported,
        out ulong oldestRetained,
        out ulong liveResumeAfter);

    // Assert: gap flagged, oldest available is 3, replay is the retained tail, live after 5.
    Assert.True(gapReported);
    Assert.Equal(3ul, oldestRetained);
    Assert.Equal(new ulong[] { 3, 4, 5 }, replayed.Select(item => item.WorkerSequence));
    Assert.Equal(5ul, liveResumeAfter);
}
// A resume watermark equal to the newest retained event means the subscriber is already
// caught up: no gap, an empty replay, and live delivery resuming after that same watermark.
[Fact]
public async Task RegisterWithReplay_NothingRetainedNewer_LiveResumeEqualsAfterSequence_NoGap()
{
    // Arrange
    Channel<MxEvent> upstream = Channel.CreateUnbounded<MxEvent>();
    await using SessionEventDistributor distributor = CreateDistributor(
        upstream.Reader,
        replayBufferCapacity: 10,
        replayRetentionSeconds: 0);
    await distributor.StartAsync(CancellationToken.None);

    // Write-then-read through a primer subscriber so 1..3 have all been pumped.
    using IEventSubscriberLease primer = distributor.Register();
    ulong published = 0;
    while (published < 3)
    {
        published++;
        upstream.Writer.TryWrite(Event(published));
        await ReadOneAsync(primer.Reader);
    }

    // Act: resume after 3, the newest retained sequence.
    using IEventSubscriberLease resumed = distributor.RegisterWithReplay(
        3,
        out IReadOnlyList<MxEvent> replayed,
        out bool gapReported,
        out ulong oldestRetained,
        out ulong liveResumeAfter);

    // Assert: nothing to replay, no gap, and the live watermark is the requested one.
    Assert.False(gapReported);
    Assert.Empty(replayed);
    Assert.Equal(3ul, liveResumeAfter);
    Assert.Equal(1ul, oldestRetained);

    // The next event published upstream is delivered live to the resumed subscriber.
    upstream.Writer.TryWrite(Event(4));
    MxEvent liveEvent = await ReadOneAsync(resumed.Reader);
    Assert.Equal(4ul, liveEvent.WorkerSequence);
}
private static async Task DrainUntilFaultAsync(ChannelReader<MxEvent> reader)
{
// Drains any buffered events, then surfaces the channel's completion fault (if any)