fix: resolve code-review findings (locally verified)

Server-054/055/056, Contracts-020/021/022, Tests-036/038/039,
IntegrationTests-030/031/032 (+033 deferred to live rig),
Client.Dotnet-026/028/029 (+027 won't-fix), Client.Go-030..034,
Client.Python-032..036, Client.Rust-033..038.

Key fix: SessionEventDistributor orphaned a subscriber that registered after
the pump completed but before disposal (Server-056). The register paths now
complete such late registrants under _lifecycleLock, and a regression test was
added. The racy dashboard-mirror gRPC test is now deterministic (Tests-039).

Verified green locally: gateway Tests targeted classes (GatewaySession,
SessionEventDistributor, GatewayOptionsValidator, ProtobufContractRoundTrip,
GatewaySessionDashboardMirror) + dotnet/go/python/rust client suites.
This commit is contained in:
Joseph Doherty
2026-06-17 05:23:14 -04:00
parent 25d04ec37e
commit 6b5fe6aa82
37 changed files with 1049 additions and 211 deletions
@@ -393,4 +393,91 @@ public sealed class GatewayOptionsValidatorTests
ValidateOptionsResult result = new GatewayOptionsValidator().Validate(null, options);
Assert.True(result.Succeeded);
}
/// <summary>
/// A negative <c>DetachGraceSeconds</c> must fail validation, and at least one failure
/// message must name the <c>MxGateway:Sessions:DetachGraceSeconds</c> key.
/// </summary>
[Fact]
public void Validate_Fails_WhenDetachGraceSecondsIsNegative()
{
    // Arrange: otherwise-valid options with only the Sessions section swapped out.
    GatewayOptions baseline = ValidOptions();
    SessionOptions negativeGrace = new() { DetachGraceSeconds = -1 };
    GatewayOptions options = CloneWithSessions(baseline, negativeGrace);
    GatewayOptionsValidator validator = new();

    // Act
    ValidateOptionsResult result = validator.Validate(null, options);

    // Assert
    Assert.True(result.Failed);
    Assert.Contains(
        result.Failures!,
        failure => failure.Contains("MxGateway:Sessions:DetachGraceSeconds"));
}
/// <summary>
/// Zero is an accepted <c>DetachGraceSeconds</c> value: validation must succeed.
/// </summary>
[Fact]
public void Validate_Succeeds_WhenDetachGraceSecondsIsZero()
{
    // Arrange: otherwise-valid options with only the Sessions section swapped out.
    GatewayOptions baseline = ValidOptions();
    SessionOptions zeroGrace = new() { DetachGraceSeconds = 0 };
    GatewayOptions options = CloneWithSessions(baseline, zeroGrace);
    GatewayOptionsValidator validator = new();

    // Act
    ValidateOptionsResult result = validator.Validate(null, options);

    // Assert
    Assert.True(result.Succeeded);
}
// -------------------------------------------------------------------------
// ReplayBufferCapacity / ReplayRetentionSeconds validation
// -------------------------------------------------------------------------
/// <summary>
/// Creates a new <see cref="GatewayOptions"/> whose <c>Events</c> section is
/// <paramref name="events"/>; the other sections assigned below are taken by reference
/// from <paramref name="source"/>.
/// </summary>
/// <param name="source">Options instance supplying every section except <c>Events</c>.</param>
/// <param name="events">Replacement <c>Events</c> section under test.</param>
/// <returns>The newly constructed options instance.</returns>
private static GatewayOptions CloneWithEvents(GatewayOptions source, EventOptions events)
{
    GatewayOptions clone = new()
    {
        Authentication = source.Authentication,
        Ldap = source.Ldap,
        Worker = source.Worker,
        Sessions = source.Sessions,
        Events = events,
        Dashboard = source.Dashboard,
        Protocol = source.Protocol,
        Alarms = source.Alarms,
        Tls = source.Tls,
    };

    return clone;
}
/// <summary>
/// A negative <c>ReplayBufferCapacity</c> must fail validation, and at least one failure
/// message must name the <c>MxGateway:Events:ReplayBufferCapacity</c> key.
/// </summary>
[Fact]
public void Validate_Fails_WhenReplayBufferCapacityIsNegative()
{
    // Arrange: otherwise-valid options with only the Events section swapped out.
    GatewayOptions baseline = ValidOptions();
    EventOptions negativeCapacity = new() { ReplayBufferCapacity = -1 };
    GatewayOptions options = CloneWithEvents(baseline, negativeCapacity);
    GatewayOptionsValidator validator = new();

    // Act
    ValidateOptionsResult result = validator.Validate(null, options);

    // Assert
    Assert.True(result.Failed);
    Assert.Contains(
        result.Failures!,
        failure => failure.Contains("MxGateway:Events:ReplayBufferCapacity"));
}
/// <summary>
/// Zero is an accepted <c>ReplayBufferCapacity</c> value: validation must succeed.
/// </summary>
[Fact]
public void Validate_Succeeds_WhenReplayBufferCapacityIsZero()
{
    // Arrange: otherwise-valid options with only the Events section swapped out.
    GatewayOptions baseline = ValidOptions();
    EventOptions zeroCapacity = new() { ReplayBufferCapacity = 0 };
    GatewayOptions options = CloneWithEvents(baseline, zeroCapacity);
    GatewayOptionsValidator validator = new();

    // Act
    ValidateOptionsResult result = validator.Validate(null, options);

    // Assert
    Assert.True(result.Succeeded);
}
/// <summary>
/// A negative <c>ReplayRetentionSeconds</c> must fail validation, and at least one failure
/// message must name the <c>MxGateway:Events:ReplayRetentionSeconds</c> key.
/// </summary>
[Fact]
public void Validate_Fails_WhenReplayRetentionSecondsIsNegative()
{
    // Arrange: otherwise-valid options with only the Events section swapped out.
    GatewayOptions baseline = ValidOptions();
    EventOptions negativeRetention = new() { ReplayRetentionSeconds = -1 };
    GatewayOptions options = CloneWithEvents(baseline, negativeRetention);
    GatewayOptionsValidator validator = new();

    // Act
    ValidateOptionsResult result = validator.Validate(null, options);

    // Assert
    Assert.True(result.Failed);
    Assert.Contains(
        result.Failures!,
        failure => failure.Contains("MxGateway:Events:ReplayRetentionSeconds"));
}
/// <summary>
/// Zero is an accepted <c>ReplayRetentionSeconds</c> value: validation must succeed.
/// </summary>
[Fact]
public void Validate_Succeeds_WhenReplayRetentionSecondsIsZero()
{
    // Arrange: otherwise-valid options with only the Events section swapped out.
    GatewayOptions baseline = ValidOptions();
    EventOptions zeroRetention = new() { ReplayRetentionSeconds = 0 };
    GatewayOptions options = CloneWithEvents(baseline, zeroRetention);
    GatewayOptionsValidator validator = new();

    // Act
    ValidateOptionsResult result = validator.Validate(null, options);

    // Assert
    Assert.True(result.Succeeded);
}
}
@@ -1543,4 +1543,49 @@ public sealed class ProtobufContractRoundTripTests
Assert.Equal(AlarmProviderMode.Subtag, parsed.OnAlarmProviderModeChanged.Mode);
Assert.Equal(unchecked((int)0x80004005), parsed.OnAlarmProviderModeChanged.Hresult);
}
/// <summary>
/// Pins the wire contract of the replay-gap sentinel and checks that it survives a
/// serialize/parse round trip. Specifically:
/// <list type="bullet">
/// <item><description><c>MxEvent.replay_gap</c> is field 14.</description></item>
/// <item><description><c>ReplayGap.requested_after_sequence</c> is field 1 and
/// <c>ReplayGap.oldest_available_sequence</c> is field 2 (number and proto name are both
/// checked through the message descriptor).</description></item>
/// <item><description>An <see cref="MxEvent"/> carrying only a <see cref="ReplayGap"/>
/// parses back equal to the original, keeps both sequence values, and reports
/// <see cref="MxEvent.BodyOneofCase.None"/> for <see cref="MxEvent.BodyCase"/>, because
/// replay_gap is not a member of the body oneof.</description></item>
/// </list>
/// </summary>
[Fact]
public void MxEvent_RoundTripsReplayGapSentinelAndPinsFieldNumbers()
{
    // --- Field-number pins ---------------------------------------------------------
    Assert.Equal(14, MxEvent.ReplayGapFieldNumber);

    // The descriptor's field collection is indexed by field number here, so each lookup
    // also depends on the generated *FieldNumber constant having the expected value.
    var descriptorFields = ReplayGap.Descriptor.Fields;

    Assert.Equal(1, descriptorFields[ReplayGap.RequestedAfterSequenceFieldNumber].FieldNumber);
    Assert.Equal(
        "requested_after_sequence",
        descriptorFields[ReplayGap.RequestedAfterSequenceFieldNumber].Name);

    Assert.Equal(2, descriptorFields[ReplayGap.OldestAvailableSequenceFieldNumber].FieldNumber);
    Assert.Equal(
        "oldest_available_sequence",
        descriptorFields[ReplayGap.OldestAvailableSequenceFieldNumber].Name);

    // --- Round trip ----------------------------------------------------------------
    // Sentinel event: replay_gap populated, no member of the body oneof assigned.
    ReplayGap gap = new()
    {
        RequestedAfterSequence = 150,
        OldestAvailableSequence = 200,
    };
    MxEvent original = new()
    {
        SessionId = "session-1",
        WorkerSequence = 0,
        ReplayGap = gap,
    };

    byte[] wireBytes = original.ToByteArray();
    MxEvent parsed = MxEvent.Parser.ParseFrom(wireBytes);

    Assert.Equal(original, parsed);

    // Setting replay_gap must not select any body oneof case.
    Assert.Equal(MxEvent.BodyOneofCase.None, parsed.BodyCase);

    Assert.NotNull(parsed.ReplayGap);
    Assert.Equal(150UL, parsed.ReplayGap.RequestedAfterSequence);
    Assert.Equal(200UL, parsed.ReplayGap.OldestAvailableSequence);
}
}
@@ -67,6 +67,12 @@ public sealed class GatewaySessionDashboardMirrorTests
workerClient.Events.Add(CreateWorkerEvent(2, MxEventFamily.OnDataChange));
workerClient.Events.Add(CreateWorkerEvent(3, MxEventFamily.OnWriteComplete));
workerClient.CompleteAfterConfiguredEvents = true;
// Hold the worker stream until BOTH subscribers are attached so neither misses an event.
// MarkReady registers the internal dashboard subscriber and starts the pump, which then
// blocks on the gate; the gRPC subscriber attaches below; only then is the finite stream
// released. Without this gate the pump can drain all three events before the gRPC
// subscriber registers — a register-vs-pump race that otherwise makes this test flaky.
workerClient.HoldEventsUntilReleased();
RecordingDashboardEventBroadcaster broadcaster = new();
await using GatewaySession session = CreateSession(workerClient, broadcaster);
@@ -79,13 +85,22 @@ public sealed class GatewaySessionDashboardMirrorTests
new GatewayMetrics());
List<MxEvent> grpcEvents = [];
await foreach (MxEvent mxEvent in service
.StreamEventsAsync(new StreamEventsRequest { SessionId = session.SessionId }, CancellationToken.None)
.WithCancellation(CancellationToken.None))
Task grpcReader = Task.Run(async () =>
{
grpcEvents.Add(mxEvent);
}
await foreach (MxEvent mxEvent in service
.StreamEventsAsync(new StreamEventsRequest { SessionId = session.SessionId }, CancellationToken.None)
.WithCancellation(CancellationToken.None))
{
grpcEvents.Add(mxEvent);
}
});
// The gRPC subscriber counts against ActiveEventSubscriberCount (the internal dashboard
// mirror does not), so count == 1 confirms it has attached. Only then release the stream.
await WaitUntilAsync(() => session.ActiveEventSubscriberCount == 1);
workerClient.ReleaseEvents();
await grpcReader.WaitAsync(TestTimeout);
await WaitUntilAsync(() => broadcaster.Captures.Count == 3);
Assert.Equal([1UL, 2UL, 3UL], grpcEvents.Select(mxEvent => mxEvent.WorkerSequence).ToArray());
@@ -280,6 +295,24 @@ public sealed class GatewaySessionDashboardMirrorTests
public bool CompleteAfterConfiguredEvents { get; set; }
// Release gate awaited by ReadEventsAsync before it yields its first event. The field is
// initialised with an already-completed gate, so tests that never call
// HoldEventsUntilReleased() are not delayed. A test that needs every subscriber attached
// before a finite, fast-completing stream drains calls HoldEventsUntilReleased() up front
// and ReleaseEvents() once ready, which avoids a register-vs-pump race.
private TaskCompletionSource _releaseGate = CreateReleasedGate();

// Builds a gate whose task is already completed (the "not holding" state).
private static TaskCompletionSource CreateReleasedGate()
{
    TaskCompletionSource released = new(TaskCreationOptions.RunContinuationsAsynchronously);
    released.SetResult();
    return released;
}

// Replaces the current gate with a fresh, uncompleted one.
public void HoldEventsUntilReleased()
{
    _releaseGate = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
}

// Completes the current gate. TrySetResult is used, so calling this on an
// already-completed gate does not throw.
public void ReleaseEvents()
{
    _releaseGate.TrySetResult();
}
public string SessionId { get; } = "session-dashboard-mirror";
public int? ProcessId { get; } = 1234;
@@ -298,6 +331,9 @@ public sealed class GatewaySessionDashboardMirrorTests
public async IAsyncEnumerable<WorkerEvent> ReadEventsAsync(
[EnumeratorCancellation] CancellationToken cancellationToken)
{
// Block before yielding any event until released (ungated by default).
await _releaseGate.Task.WaitAsync(cancellationToken).ConfigureAwait(false);
foreach (WorkerEvent workerEvent in Events)
{
cancellationToken.ThrowIfCancellationRequested();
@@ -545,6 +545,43 @@ public sealed class GatewaySessionTests
Assert.False(session.IsDetachGraceExpired(clock.GetUtcNow()));
}
/// <summary>
/// Server-055 regression. When the very first attach FAILS (so the distributor never
/// registered a subscriber), the session must not enter the detach-grace window. The
/// failure path's <c>DetachEventSubscriber</c> rolls the reserved slot back to 0 but must
/// leave <c>DetachedAtUtc</c> unset: "last subscriber dropped" only has meaning after a
/// subscriber was successfully registered. A freshly-Ready session whose first attach
/// failed therefore stays out of grace and never becomes sweep-eligible on that basis.
/// </summary>
[Fact]
public async Task DetachGrace_FailedFirstAttach_DoesNotEnterGrace()
{
    FakeTimeProvider clock = new(DateTimeOffset.UtcNow);
    FakeWorkerClient workerClient = new();
    TimeSpan grace = TimeSpan.FromSeconds(30);

    // With QueueCapacity = 0 the distributor constructor throws
    // ArgumentOutOfRangeException inside StartDistributorAndRegister. The first
    // AttachEventSubscriber therefore fails after reserving its slot, which drives the
    // catch -> DetachEventSubscriber rollback path under test.
    await using GatewaySession session = CreateReadySessionWithDetachGrace(
        workerClient,
        clock,
        detachGrace: grace,
        queueCapacity: 0);

    Assert.ThrowsAny<ArgumentException>(
        () => session.AttachEventSubscriber(maxSubscribers: 1));

    // Slot rolled back, state untouched, and no detach timestamp recorded: there was never
    // a successful subscriber, so there is nothing to be "detached" from.
    Assert.Equal(SessionState.Ready, session.State);
    Assert.Equal(0, session.ActiveEventSubscriberCount);
    Assert.Null(session.DetachedAtUtc);

    // However far the clock moves, grace expiry must stay false.
    clock.Advance(TimeSpan.FromHours(1));
    DateTimeOffset muchLater = clock.GetUtcNow();
    Assert.False(session.IsDetachGraceExpired(muchLater));
}
/// <summary>
/// Task 11. The gateway-owned internal dashboard subscriber must NOT keep a session out
/// of detach-grace: with only the dashboard mirror attached (and no external gRPC
@@ -618,7 +655,8 @@ public sealed class GatewaySessionTests
IWorkerClient workerClient,
TimeProvider timeProvider,
TimeSpan detachGrace,
IDashboardEventBroadcaster? dashboardBroadcaster = null)
IDashboardEventBroadcaster? dashboardBroadcaster = null,
int queueCapacity = 8)
{
GatewaySession session = new(
sessionId: "session-test-detach-grace",
@@ -636,7 +674,7 @@ public sealed class GatewaySessionTests
openedAt: timeProvider.GetUtcNow(),
eventStreaming: new SessionEventStreaming(
new MxAccessGrpcMapper(),
new EventOptions { QueueCapacity = 8 },
new EventOptions { QueueCapacity = queueCapacity },
NullLogger<SessionEventDistributor>.Instance,
timeProvider,
new GatewayMetrics(),
@@ -702,16 +702,71 @@ public sealed class SessionEventDistributorTests
/// <summary>
/// Drains <paramref name="reader"/> until the channel's completion fault surfaces as an
/// exception from this method. If the channel instead completes gracefully, the helper
/// fails the test rather than looping forever.
/// </summary>
/// <param name="reader">Subscriber channel expected to complete with a fault.</param>
private static async Task DrainUntilFaultAsync(ChannelReader<MxEvent> reader)
{
    // Drains any buffered events, then surfaces the channel's completion fault (if any)
    // by awaiting the final WaitToReadAsync past the buffered tail. Each wait is bounded
    // by ReadTimeout so a channel that never completes fails fast instead of hanging.
    while (true)
    {
        // Await exactly once per pass and keep the result: a discarded extra wait would
        // hide the "completed gracefully" signal this helper needs to act on.
        bool hasMore = await reader.WaitToReadAsync().AsTask().WaitAsync(ReadTimeout);
        if (!hasMore)
        {
            // Graceful completion — propagate any stored exception, then fail so the
            // helper does not spin on a channel that completes without an exception.
            await reader.Completion;
            Assert.Fail("DrainUntilFaultAsync: channel completed gracefully (no fault).");
            return;
        }

        // Discard everything currently buffered before waiting again.
        while (reader.TryRead(out _))
        {
        }
    }
}
/// <summary>
/// Regression: a subscriber that registers AFTER the pump has completed (the event source
/// finished) but BEFORE the distributor is disposed must find its channel completed
/// immediately instead of being left open forever. By that point the pump has already run
/// its final <c>CompleteAllSubscribers</c> sweep and exited, so without the
/// register-after-completion guard the late subscriber's reader would hang indefinitely.
/// The failure was first seen as an order-dependent hang in
/// <c>GatewaySessionDashboardMirrorTests</c>, where a gRPC subscriber attached after a
/// fast-completing worker stream had already drained.
/// </summary>
[Fact]
public async Task Register_AfterSourceCompletes_CompletesLateSubscriberInsteadOfHanging()
{
    // Reads a subscriber channel to completion, discarding every event.
    static async Task DrainToCompletionAsync(ChannelReader<MxEvent> reader, CancellationToken token)
    {
        await foreach (MxEvent _ in reader.ReadAllAsync(token))
        {
        }
    }

    Channel<MxEvent> source = Channel.CreateUnbounded<MxEvent>();
    await using SessionEventDistributor distributor = CreateDistributor(source.Reader);
    await distributor.StartAsync(CancellationToken.None);

    // The early subscriber is the probe that tells us when the pump's final completion
    // sweep has happened.
    using IEventSubscriberLease early = distributor.Register();

    // Completing the source makes the pump drain it, run CompleteAllSubscribers, and exit.
    source.Writer.Complete();

    // Once the early subscriber has drained to completion the sweep is known to have run,
    // so any registration from here on is in the register-after-completion window.
    using (CancellationTokenSource earlyCts = new(ReadTimeout))
    {
        await DrainToCompletionAsync(early.Reader, earlyCts.Token);
    }

    // Late registration: the channel must already be completed. The read is bounded by
    // ReadTimeout so a regression shows up as a quick OperationCanceledException rather
    // than an indefinite hang.
    using IEventSubscriberLease late = distributor.Register();
    using CancellationTokenSource lateCts = new(ReadTimeout);
    await DrainToCompletionAsync(late.Reader, lateCts.Token);

    Assert.False(lateCts.IsCancellationRequested);
}
/// <summary>
/// Convenience overload: creates a distributor over <paramref name="source"/> with a
/// replay buffer capacity of 1024 and a replay retention of 300 seconds.
/// </summary>
/// <param name="source">Channel reader the distributor consumes events from.</param>
/// <returns>The distributor produced by the three-argument overload.</returns>
private static SessionEventDistributor CreateDistributor(ChannelReader<MxEvent> source)
{
    return CreateDistributor(source, replayBufferCapacity: 1024, replayRetentionSeconds: 300);
}