fix: resolve code-review findings (locally verified)

Server-054/055/056, Contracts-020/021/022, Tests-036/038/039,
IntegrationTests-030/031/032 (+033 deferred to live rig),
Client.Dotnet-026/028/029 (+027 won't-fix), Client.Go-030..034,
Client.Python-032..036, Client.Rust-033..038.

Key fix: SessionEventDistributor orphaned a subscriber that registered after
the pump completed but before disposal (Server-056). The register paths now
complete late registrants under _lifecycleLock, and a regression test was added.
The racy dashboard-mirror gRPC test was made deterministic (Tests-039).

Verified green locally: gateway Tests targeted classes (GatewaySession,
SessionEventDistributor, GatewayOptionsValidator, ProtobufContractRoundTrip,
GatewaySessionDashboardMirror) + dotnet/go/python/rust client suites.
This commit is contained in:
Joseph Doherty
2026-06-17 05:23:14 -04:00
parent 25d04ec37e
commit 6b5fe6aa82
37 changed files with 1049 additions and 211 deletions
@@ -23596,8 +23596,7 @@ namespace ZB.MOM.WW.MxGateway.Contracts.Proto {
/// stream; it is ALWAYS unset on events in DrainEventsReply (the diagnostic
/// drain path never emits the sentinel).
/// Additive (proto3): existing clients that ignore this field continue to
/// deserialize the stream unchanged. (Reconnect/replay logic is Task 12; this
/// is the contract surface only.)
/// deserialize the stream unchanged.
/// </summary>
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
[global::System.CodeDom.Compiler.GeneratedCode("protoc", null)]
@@ -729,8 +729,7 @@ message MxEvent {
// stream; it is ALWAYS unset on events in DrainEventsReply (the diagnostic
// drain path never emits the sentinel).
// Additive (proto3): existing clients that ignore this field continue to
// deserialize the stream unchanged. (Reconnect/replay logic is Task 12; this
// is the contract surface only.)
// deserialize the stream unchanged.
optional ReplayGap replay_gap = 14;
oneof body {
@@ -669,11 +669,12 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output)
Assert.NotEqual(0, userToIdReply.ArchestraUserToId.UserId);
}
// Suspend / Activate against the advised item. The dev-rig TestInt item class
// may not be suspendable (MXAccess returns 0x80070057 / E_INVALIDARG for a
// wrong item class — see B8 notes). That is MXAccess parity: assert the reply
// kind and a non-INVALID_REQUEST status, surface the HResult and MxStatusProxy
// for the record, and do NOT treat a provider-side rejection as a test failure.
// Suspend / Activate against the added-but-not-advised item (no Advise was issued
// between AddItem and this call). The dev-rig TestInt item class may not be
// suspendable (MXAccess returns 0x80070057 / E_INVALIDARG for a wrong item class
// — see B8 notes). That is MXAccess parity: assert the reply kind and a
// non-INVALID_REQUEST status, surface the HResult and MxStatusProxy for the
// record, and do NOT treat a provider-side rejection as a test failure.
MxCommandReply suspendReply = await fixture.Service.Invoke(
CreateSuspendRequest(sessionId, serverHandle, itemHandle),
new TestServerCallContext()).ConfigureAwait(false);
@@ -827,8 +828,9 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output)
streamCancellation.Token)
.ConfigureAwait(false);
}
catch (TimeoutException)
catch (TimeoutException ex)
{
output.WriteLine($"B8: sample-bearing batch predicate timed out: {ex.Message}");
bufferedBatch = null;
}
@@ -25,6 +25,12 @@ public sealed class GatewaySession
private readonly TimeSpan _detachGrace;
private readonly TimeSpan _workerReadyWaitTimeout;
private DateTimeOffset? _detachedAtUtc;
// True once at least one external subscriber attached SUCCESSFULLY. Detach-grace's
// "last subscriber dropped" stamp (see DetachEventSubscriber) is gated on this so a
// FAILED first attach — which still runs the rollback DetachEventSubscriber from the
// attach catch path — does not push a never-subscribed session into the grace window
// (Server-055).
private bool _everHadEventSubscriber;
private SessionEventDistributor? _eventDistributor;
private bool _eventDistributorStarted;
private bool _dashboardMirrorStarted;
@@ -842,6 +848,7 @@ public sealed class GatewaySession
try
{
IEventSubscriberLease distributorLease = StartDistributorAndRegister();
MarkEventSubscriberAttached();
return new EventSubscriberLease(this, distributorLease);
}
catch
@@ -906,6 +913,7 @@ public sealed class GatewaySession
out ulong oldestAvailableSequence,
out ulong liveResumeSequence);
MarkEventSubscriberAttached();
return new EventSubscriberReplayAttachment(
new EventSubscriberLease(this, distributorLease),
replayedEvents,
@@ -920,6 +928,17 @@ public sealed class GatewaySession
}
}
// Records that an external subscriber attached successfully by setting
// _everHadEventSubscriber. That flag gates the detach-grace "last subscriber dropped"
// stamp so a FAILED first attach (which still rolls back via DetachEventSubscriber)
// never pushes a never-subscribed session into grace (Server-055).
// The attach paths call this only after the distributor registration call has returned,
// i.e. never on the failure path that ends in the rollback.
private void MarkEventSubscriberAttached()
{
// Written under _syncRoot, the lock the detach-grace stamp is documented to share, so
// this write is serialized with the check that reads the flag.
lock (_syncRoot)
{
_everHadEventSubscriber = true;
}
}
/// <summary>
/// Invokes a worker command synchronously and returns the reply.
/// </summary>
@@ -1862,7 +1881,12 @@ public sealed class GatewaySession
// Closing/Closed/Faulted there is nothing to retain. This is the detach→grace-start
// transition; it shares _syncRoot with the reattach→grace-cancel write above and the
// sweeper's IsDetachGraceExpired read, so the three serialize.
if (_detachGrace > TimeSpan.Zero
// Only stamp a detach that mirrors a prior SUCCESSFUL attach. The attach catch path
// calls this same method to roll back a reserved slot when the FIRST attach failed
// before any subscriber registered; that never-subscribed session must not enter the
// grace window (Server-055).
if (_everHadEventSubscriber
&& _detachGrace > TimeSpan.Zero
&& _activeEventSubscriberCount == 0
&& _state is not (SessionState.Closing or SessionState.Closed or SessionState.Faulted))
{
@@ -116,6 +116,17 @@ public sealed class SessionEventDistributor : IAsyncDisposable
private bool _started;
private bool _disposed;
// Set once the pump has run its final CompleteAllSubscribers sweep — the event source
// completed or faulted and the pump exited. Guarded by _lifecycleLock together with the
// subscriber add. A subscriber that registers AFTER this point but BEFORE DisposeAsync
// (the source ended but the session is not yet torn down) would otherwise be added with a
// channel the now-exited pump never completes, hanging its reader forever. The register
// paths complete such a late registrant's channel immediately with the same terminal
// state. _completionError carries the terminal exception (source fault) or null (graceful
// source completion), mirroring what the final CompleteAllSubscribers passed.
private bool _completed;
private Exception? _completionError;
/// <summary>
/// Initializes a per-session event distributor.
/// </summary>
@@ -304,6 +315,16 @@ public sealed class SessionEventDistributor : IAsyncDisposable
{
ObjectDisposedException.ThrowIf(_disposed, this);
_subscribers[subscriber.Id] = subscriber;
// Close the register-after-pump-completion window: if the pump already ran its
// final CompleteAllSubscribers (source completed/faulted) but the distributor is
// not yet disposed, no further completion sweep will run, so complete this late
// registrant's channel now with the same terminal state instead of leaving its
// reader hanging.
if (_completed)
{
subscriber.Channel.Writer.TryComplete(_completionError);
}
}
return new SubscriberLease(this, subscriber);
@@ -450,6 +471,14 @@ public sealed class SessionEventDistributor : IAsyncDisposable
{
ObjectDisposedException.ThrowIf(_disposed, this);
_subscribers[id] = subscriber;
// Same register-after-pump-completion guard as Register: a resume that races in
// after the source already ended still gets its retained replay batch (snapshot
// above), but its live channel must be completed now since the pump is gone.
if (_completed)
{
subscriber.Channel.Writer.TryComplete(_completionError);
}
}
}
@@ -628,9 +657,21 @@ public sealed class SessionEventDistributor : IAsyncDisposable
/// <summary>
/// Records the distributor's terminal state and completes the channel of every subscriber
/// that is currently registered.
/// </summary>
/// <param name="error">
/// The exception to complete the subscriber channels with, or <c>null</c> to complete them
/// gracefully. The same value is stored in <c>_completionError</c> for late registrants.
/// </param>
private void CompleteAllSubscribers(Exception? error)
{
    // Record the terminal state AND complete the current subscribers under _lifecycleLock
    // so this serializes with the subscriber-add in Register/RegisterWithReplay: a
    // subscriber added before this runs is in the map and completed by the loop; one that
    // races in afterward sees _completed and completes its own channel in the register
    // path. Exactly one of the two completes each subscriber. TryComplete is non-blocking
    // and (channels use AllowSynchronousContinuations=false) runs no continuation inline,
    // so holding the lock across the loop cannot stall or re-enter.
    lock (_lifecycleLock)
    {
        // The flag must be set before the sweep and inside the same critical section;
        // otherwise a registrant could slip in between the sweep and the flag write and
        // be completed by neither side.
        _completed = true;
        _completionError = error;

        foreach (Subscriber subscriber in _subscribers.Values)
        {
            subscriber.Channel.Writer.TryComplete(error);
        }
    }
}
@@ -393,4 +393,91 @@ public sealed class GatewayOptionsValidatorTests
ValidateOptionsResult result = new GatewayOptionsValidator().Validate(null, options);
Assert.True(result.Succeeded);
}
/// <summary>
/// A negative <c>DetachGraceSeconds</c> must fail validation with a failure message that
/// names the <c>MxGateway:Sessions:DetachGraceSeconds</c> key.
/// </summary>
[Fact]
public void Validate_Fails_WhenDetachGraceSecondsIsNegative()
{
    GatewayOptions baseline = ValidOptions();
    SessionOptions negativeGrace = new() { DetachGraceSeconds = -1 };
    GatewayOptions options = CloneWithSessions(baseline, negativeGrace);
    GatewayOptionsValidator validator = new();

    ValidateOptionsResult outcome = validator.Validate(null, options);

    Assert.True(outcome.Failed);
    Assert.Contains(
        outcome.Failures!,
        failure => failure.Contains("MxGateway:Sessions:DetachGraceSeconds"));
}
/// <summary>
/// Zero is an accepted <c>DetachGraceSeconds</c> value: validation must succeed.
/// </summary>
[Fact]
public void Validate_Succeeds_WhenDetachGraceSecondsIsZero()
{
    GatewayOptions baseline = ValidOptions();
    SessionOptions zeroGrace = new() { DetachGraceSeconds = 0 };
    GatewayOptions options = CloneWithSessions(baseline, zeroGrace);
    GatewayOptionsValidator validator = new();

    ValidateOptionsResult outcome = validator.Validate(null, options);

    Assert.True(outcome.Succeeded);
}
// -------------------------------------------------------------------------
// ReplayBufferCapacity / ReplayRetentionSeconds validation
// -------------------------------------------------------------------------
// Builds a new GatewayOptions that reuses every section instance of <source> as-is and
// substitutes only the Events section with <events>.
private static GatewayOptions CloneWithEvents(GatewayOptions source, EventOptions events)
{
    GatewayOptions clone = new()
    {
        Authentication = source.Authentication,
        Ldap = source.Ldap,
        Worker = source.Worker,
        Sessions = source.Sessions,
        Events = events,
        Dashboard = source.Dashboard,
        Protocol = source.Protocol,
        Alarms = source.Alarms,
        Tls = source.Tls,
    };

    return clone;
}
/// <summary>
/// A negative <c>ReplayBufferCapacity</c> must fail validation with a failure message that
/// names the <c>MxGateway:Events:ReplayBufferCapacity</c> key.
/// </summary>
[Fact]
public void Validate_Fails_WhenReplayBufferCapacityIsNegative()
{
    GatewayOptions baseline = ValidOptions();
    EventOptions negativeCapacity = new() { ReplayBufferCapacity = -1 };
    GatewayOptions options = CloneWithEvents(baseline, negativeCapacity);
    GatewayOptionsValidator validator = new();

    ValidateOptionsResult outcome = validator.Validate(null, options);

    Assert.True(outcome.Failed);
    Assert.Contains(
        outcome.Failures!,
        failure => failure.Contains("MxGateway:Events:ReplayBufferCapacity"));
}
/// <summary>
/// Zero is an accepted <c>ReplayBufferCapacity</c> value: validation must succeed.
/// </summary>
[Fact]
public void Validate_Succeeds_WhenReplayBufferCapacityIsZero()
{
    GatewayOptions baseline = ValidOptions();
    EventOptions zeroCapacity = new() { ReplayBufferCapacity = 0 };
    GatewayOptions options = CloneWithEvents(baseline, zeroCapacity);
    GatewayOptionsValidator validator = new();

    ValidateOptionsResult outcome = validator.Validate(null, options);

    Assert.True(outcome.Succeeded);
}
/// <summary>
/// A negative <c>ReplayRetentionSeconds</c> must fail validation with a failure message
/// that names the <c>MxGateway:Events:ReplayRetentionSeconds</c> key.
/// </summary>
[Fact]
public void Validate_Fails_WhenReplayRetentionSecondsIsNegative()
{
    GatewayOptions baseline = ValidOptions();
    EventOptions negativeRetention = new() { ReplayRetentionSeconds = -1 };
    GatewayOptions options = CloneWithEvents(baseline, negativeRetention);
    GatewayOptionsValidator validator = new();

    ValidateOptionsResult outcome = validator.Validate(null, options);

    Assert.True(outcome.Failed);
    Assert.Contains(
        outcome.Failures!,
        failure => failure.Contains("MxGateway:Events:ReplayRetentionSeconds"));
}
/// <summary>
/// Zero is an accepted <c>ReplayRetentionSeconds</c> value: validation must succeed.
/// </summary>
[Fact]
public void Validate_Succeeds_WhenReplayRetentionSecondsIsZero()
{
    GatewayOptions baseline = ValidOptions();
    EventOptions zeroRetention = new() { ReplayRetentionSeconds = 0 };
    GatewayOptions options = CloneWithEvents(baseline, zeroRetention);
    GatewayOptionsValidator validator = new();

    ValidateOptionsResult outcome = validator.Validate(null, options);

    Assert.True(outcome.Succeeded);
}
}
@@ -1543,4 +1543,49 @@ public sealed class ProtobufContractRoundTripTests
Assert.Equal(AlarmProviderMode.Subtag, parsed.OnAlarmProviderModeChanged.Mode);
Assert.Equal(unchecked((int)0x80004005), parsed.OnAlarmProviderModeChanged.Hresult);
}
/// <summary>
/// Pins the wire contract of the replay-gap sentinel and round-trips it. The test checks
/// that <c>MxEvent.replay_gap</c> is field 14, that
/// <c>ReplayGap.requested_after_sequence</c> is field 1 and
/// <c>ReplayGap.oldest_available_sequence</c> is field 2 (number and proto name, read from
/// the descriptor), and that an <see cref="MxEvent"/> carrying only a
/// <see cref="ReplayGap"/> survives serialize/parse with both sequences intact while
/// <see cref="MxEvent.BodyCase"/> stays <see cref="MxEvent.BodyOneofCase.None"/>, because
/// replay_gap lives outside the body oneof.
/// </summary>
[Fact]
public void MxEvent_RoundTripsReplayGapSentinelAndPinsFieldNumbers()
{
    // Wire number of MxEvent.replay_gap.
    Assert.Equal(14, MxEvent.ReplayGapFieldNumber);

    // Look each ReplayGap field up through its generated constant, then pin the number
    // and the proto name the descriptor reports for it.
    var gapFields = ReplayGap.Descriptor.Fields;

    var requestedAfter = gapFields[ReplayGap.RequestedAfterSequenceFieldNumber];
    Assert.Equal(1, requestedAfter.FieldNumber);
    Assert.Equal("requested_after_sequence", requestedAfter.Name);

    var oldestAvailable = gapFields[ReplayGap.OldestAvailableSequenceFieldNumber];
    Assert.Equal(2, oldestAvailable.FieldNumber);
    Assert.Equal("oldest_available_sequence", oldestAvailable.Name);

    // Sentinel event: replay_gap populated, no body oneof member set.
    ReplayGap gap = new()
    {
        RequestedAfterSequence = 150,
        OldestAvailableSequence = 200,
    };
    MxEvent sentinel = new()
    {
        SessionId = "session-1",
        WorkerSequence = 0,
        ReplayGap = gap,
    };

    byte[] wire = sentinel.ToByteArray();
    MxEvent decoded = MxEvent.Parser.ParseFrom(wire);

    Assert.Equal(sentinel, decoded);

    // Setting replay_gap must not select any body case.
    Assert.Equal(MxEvent.BodyOneofCase.None, decoded.BodyCase);
    Assert.NotNull(decoded.ReplayGap);
    Assert.Equal(150UL, decoded.ReplayGap.RequestedAfterSequence);
    Assert.Equal(200UL, decoded.ReplayGap.OldestAvailableSequence);
}
}
@@ -67,6 +67,12 @@ public sealed class GatewaySessionDashboardMirrorTests
workerClient.Events.Add(CreateWorkerEvent(2, MxEventFamily.OnDataChange));
workerClient.Events.Add(CreateWorkerEvent(3, MxEventFamily.OnWriteComplete));
workerClient.CompleteAfterConfiguredEvents = true;
// Hold the worker stream until BOTH subscribers are attached so neither misses an event.
// MarkReady registers the internal dashboard subscriber and starts the pump, which then
// blocks on the gate; the gRPC subscriber attaches below; only then is the finite stream
// released. Without this gate the pump can drain all three events before the gRPC
// subscriber registers — a register-vs-pump race that otherwise makes this test flaky.
workerClient.HoldEventsUntilReleased();
RecordingDashboardEventBroadcaster broadcaster = new();
await using GatewaySession session = CreateSession(workerClient, broadcaster);
@@ -79,13 +85,22 @@ public sealed class GatewaySessionDashboardMirrorTests
new GatewayMetrics());
List<MxEvent> grpcEvents = [];
await foreach (MxEvent mxEvent in service
.StreamEventsAsync(new StreamEventsRequest { SessionId = session.SessionId }, CancellationToken.None)
.WithCancellation(CancellationToken.None))
Task grpcReader = Task.Run(async () =>
{
grpcEvents.Add(mxEvent);
}
await foreach (MxEvent mxEvent in service
.StreamEventsAsync(new StreamEventsRequest { SessionId = session.SessionId }, CancellationToken.None)
.WithCancellation(CancellationToken.None))
{
grpcEvents.Add(mxEvent);
}
});
// The gRPC subscriber counts against ActiveEventSubscriberCount (the internal dashboard
// mirror does not), so count == 1 confirms it has attached. Only then release the stream.
await WaitUntilAsync(() => session.ActiveEventSubscriberCount == 1);
workerClient.ReleaseEvents();
await grpcReader.WaitAsync(TestTimeout);
await WaitUntilAsync(() => broadcaster.Captures.Count == 3);
Assert.Equal([1UL, 2UL, 3UL], grpcEvents.Select(mxEvent => mxEvent.WorkerSequence).ToArray());
@@ -280,6 +295,24 @@ public sealed class GatewaySessionDashboardMirrorTests
public bool CompleteAfterConfiguredEvents { get; set; }
// Gate that holds the event stream before it yields anything. Released by default, so
// ungated tests are unaffected. HoldEventsUntilReleased() makes ReadEventsAsync block
// until ReleaseEvents(), letting a test attach every subscriber before a finite,
// fast-completing stream drains (avoids a register-vs-pump race).
private TaskCompletionSource _releaseGate = CreateReleasedGate();
// Creates a gate whose task is already completed, i.e. one that blocks nobody. This is
// the default state, so tests that never call HoldEventsUntilReleased() are unaffected.
private static TaskCompletionSource CreateReleasedGate()
{
    TaskCompletionSource released = new(TaskCreationOptions.RunContinuationsAsynchronously);
    released.SetResult();

    return released;
}
// Arms the gate: replaces it with a fresh, not-yet-completed one so that a subsequent
// ReadEventsAsync call waits until ReleaseEvents() is invoked.
public void HoldEventsUntilReleased()
{
    _releaseGate = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
}
// Opens the current gate. TrySetResult makes a repeated release (or a release of the
// default, already-open gate) a harmless no-op.
public void ReleaseEvents()
{
    _releaseGate.TrySetResult();
}
public string SessionId { get; } = "session-dashboard-mirror";
public int? ProcessId { get; } = 1234;
@@ -298,6 +331,9 @@ public sealed class GatewaySessionDashboardMirrorTests
public async IAsyncEnumerable<WorkerEvent> ReadEventsAsync(
[EnumeratorCancellation] CancellationToken cancellationToken)
{
// Block before yielding any event until released (ungated by default).
await _releaseGate.Task.WaitAsync(cancellationToken).ConfigureAwait(false);
foreach (WorkerEvent workerEvent in Events)
{
cancellationToken.ThrowIfCancellationRequested();
@@ -545,6 +545,43 @@ public sealed class GatewaySessionTests
Assert.False(session.IsDetachGraceExpired(clock.GetUtcNow()));
}
/// <summary>
/// Regression test for Server-055. When the very first attach fails before any subscriber
/// was registered, the attach catch path still calls <c>DetachEventSubscriber</c> to roll
/// the reserved slot back to 0. That rollback must not stamp <c>DetachedAtUtc</c>: "last
/// subscriber dropped" only has meaning after a successful registration. The session must
/// therefore remain Ready, outside the grace window, and must never become sweep-eligible
/// on detach-grace grounds.
/// </summary>
[Fact]
public async Task DetachGrace_FailedFirstAttach_DoesNotEnterGrace()
{
    FakeTimeProvider fakeClock = new(DateTimeOffset.UtcNow);
    FakeWorkerClient worker = new();
    TimeSpan grace = TimeSpan.FromSeconds(30);

    // A queue capacity of 0 makes the distributor constructor throw
    // ArgumentOutOfRangeException from inside StartDistributorAndRegister. The first
    // AttachEventSubscriber therefore fails after its slot was reserved, which drives
    // the catch -> DetachEventSubscriber rollback under test.
    await using GatewaySession session = CreateReadySessionWithDetachGrace(
        worker,
        fakeClock,
        detachGrace: grace,
        queueCapacity: 0);

    Assert.ThrowsAny<ArgumentException>(
        () => session.AttachEventSubscriber(maxSubscribers: 1));

    // Slot rolled back, state untouched, and no detach timestamp recorded.
    Assert.Equal(SessionState.Ready, session.State);
    Assert.Equal(0, session.ActiveEventSubscriberCount);
    Assert.Null(session.DetachedAtUtc);

    // However far the clock moves, the session must not report an expired grace window.
    fakeClock.Advance(TimeSpan.FromHours(1));
    DateTimeOffset anHourLater = fakeClock.GetUtcNow();
    Assert.False(session.IsDetachGraceExpired(anHourLater));
}
/// <summary>
/// Task 11. The gateway-owned internal dashboard subscriber must NOT keep a session out
/// of detach-grace: with only the dashboard mirror attached (and no external gRPC
@@ -618,7 +655,8 @@ public sealed class GatewaySessionTests
IWorkerClient workerClient,
TimeProvider timeProvider,
TimeSpan detachGrace,
IDashboardEventBroadcaster? dashboardBroadcaster = null)
IDashboardEventBroadcaster? dashboardBroadcaster = null,
int queueCapacity = 8)
{
GatewaySession session = new(
sessionId: "session-test-detach-grace",
@@ -636,7 +674,7 @@ public sealed class GatewaySessionTests
openedAt: timeProvider.GetUtcNow(),
eventStreaming: new SessionEventStreaming(
new MxAccessGrpcMapper(),
new EventOptions { QueueCapacity = 8 },
new EventOptions { QueueCapacity = queueCapacity },
NullLogger<SessionEventDistributor>.Instance,
timeProvider,
new GatewayMetrics(),
@@ -702,16 +702,71 @@ public sealed class SessionEventDistributorTests
/// <summary>
/// Drains <paramref name="reader"/> until the channel's completion fault surfaces as an
/// exception thrown from this method. The helper is expected to exit by throwing; a channel
/// that completes gracefully is reported as a test failure.
/// </summary>
/// <param name="reader">The subscriber channel reader that is expected to fault.</param>
/// <returns>A task that faults with the channel's error, a timeout, or a test failure.</returns>
private static async Task DrainUntilFaultAsync(ChannelReader<MxEvent> reader)
{
    // Drains any buffered events, then surfaces the channel's completion fault (if any)
    // by awaiting the final WaitToReadAsync past the buffered tail.
    // If WaitToReadAsync returns false (graceful completion rather than a fault),
    // await Completion to surface any fault stored there, then Assert.Fail so the
    // helper does not spin forever on a channel that completes without an exception.
    while (true)
    {
        // One bounded wait per iteration; its result decides whether to drain or stop.
        bool hasMore = await reader.WaitToReadAsync().AsTask().WaitAsync(ReadTimeout);
        if (!hasMore)
        {
            // Graceful completion — propagate any stored exception, then fail. The wait
            // is bounded like every other wait here so the helper can never hang.
            await reader.Completion.WaitAsync(ReadTimeout);
            Assert.Fail("DrainUntilFaultAsync: channel completed gracefully (no fault).");
            return;
        }

        // Discard everything currently buffered before waiting again.
        while (reader.TryRead(out _))
        {
        }
    }
}
/// <summary>
/// Regression test for the register-after-completion window. Once the event source has
/// finished, the pump runs its last <c>CompleteAllSubscribers</c> sweep and exits; a
/// subscriber that registers after that point, but before the distributor is disposed,
/// must find its channel already completed. Without the guard in the register path its
/// reader would wait forever. The defect first showed up as an order-dependent hang in
/// <c>GatewaySessionDashboardMirrorTests</c>, where a gRPC subscriber attached after a
/// fast-completing worker stream had already drained.
/// </summary>
[Fact]
public async Task Register_AfterSourceCompletes_CompletesLateSubscriberInsteadOfHanging()
{
    // Reads a lease's channel to its end, discarding every event.
    static async Task ReadToEndAsync(IEventSubscriberLease lease, CancellationToken token)
    {
        await foreach (MxEvent _ in lease.Reader.ReadAllAsync(token))
        {
        }
    }

    Channel<MxEvent> source = Channel.CreateUnbounded<MxEvent>();
    await using SessionEventDistributor distributor = CreateDistributor(source.Reader);
    await distributor.StartAsync(CancellationToken.None);

    // Registered up front purely as a probe: once its channel ends we know the pump's
    // final completion sweep has run.
    using IEventSubscriberLease early = distributor.Register();

    // Ending the source makes the pump drain it, sweep the subscribers and exit.
    source.Writer.Complete();

    // Reading the probe to its end places everything below unambiguously inside the
    // register-after-completion window.
    using (CancellationTokenSource earlyCts = new(ReadTimeout))
    {
        await ReadToEndAsync(early, earlyCts.Token);
    }

    // Late registration: the channel has to be complete already. The read is bounded by
    // ReadTimeout, so a regression fails fast with OperationCanceledException rather
    // than hanging the test run.
    using IEventSubscriberLease late = distributor.Register();
    using CancellationTokenSource lateCts = new(ReadTimeout);
    await ReadToEndAsync(late, lateCts.Token);

    Assert.False(lateCts.IsCancellationRequested);
}
// Convenience overload: builds a distributor over <source> with the replay settings used
// by most tests (buffer capacity 1024, retention 300 seconds).
private static SessionEventDistributor CreateDistributor(ChannelReader<MxEvent> source)
{
    return CreateDistributor(source, replayBufferCapacity: 1024, replayRetentionSeconds: 300);
}