Resolve Tests-027..031: flake root cause + coverage gaps
Tests-027 GatewayMetrics exposes its internal Meter; the
StreamEvents_WhenEventIsWritten_RecordsSendDuration listener
now filters by ReferenceEquals(instrument.Meter, metrics.Meter)
instead of Meter.Name, so parallel tests with their own
GatewayMetrics no longer cross-contaminate the families list.
Tests-028 FakeWorkerClient.Kill now captures LastKillReason;
SessionManager.KillWorkerAsync tests pin the reason
propagation end-to-end and cover the blank/null guard. The
DashboardSessionAdminService kill test pins the literal
dashboard-admin-kill reason.
Tests-029 Added CloseSessionAsync_BlankSessionId_ReturnsFailure to mirror
the existing KillWorkerAsync blank-id coverage.
Tests-030 DeleteAsync_WhenStoreRefuses_ReportsFriendlyError renamed and
extended to assert the dashboard-delete-key audit row with
Details = not-found-or-active. Added
DeleteAsync_BlankKeyId_ReturnsFailure.
Tests-031 DashboardSnapshotPublisher reconnect test now measures the
gap from the first throw inside the fake (firstThrowAt) to
secondSubscribeAt, isolating Task.Delay from StartAsync /
scheduling overhead.
All resolved on 2026-05-24; 512/512 gateway tests pass.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -78,6 +78,15 @@ public sealed class GatewayMetrics : IDisposable
|
||||
_meter.CreateObservableGauge("mxgateway.events.grpc_stream_queue.depth", GetGrpcEventStreamQueueDepth);
|
||||
}
|
||||
|
||||
/// <summary>
/// Gets the <see cref="Meter"/> that this metrics object creates its instruments on. The property is
/// internal so tests (via <c>InternalsVisibleTo</c>) can have a <see cref="MeterListener"/> match
/// instruments with <see cref="object.ReferenceEquals"/> against this exact instance. Matching on the
/// process-shared <see cref="MeterName"/> is not enough: parallel tests that each build their own
/// <see cref="GatewayMetrics"/> would then observe one another's measurements (Tests-027).
/// </summary>
internal Meter Meter
{
    get { return _meter; }
}
|
||||
|
||||
/// <summary>
|
||||
/// Records that a session has been opened.
|
||||
/// </summary>
|
||||
|
||||
+41
-2
@@ -147,11 +147,20 @@ public sealed class DashboardApiKeyManagementServiceTests
|
||||
&& entry.Details == "deleted");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Tests-030: when the admin store refuses the delete (returns <c>false</c>), the service
|
||||
/// still emits a <c>dashboard-delete-key</c> audit entry with <c>Details = "not-found-or-active"</c>
|
||||
/// because <c>AppendAuditAsync</c> runs unconditionally after the store call. A regression that
|
||||
/// moved the audit-append call inside the <c>if (deleted)</c> branch would silently drop the
|
||||
/// audit trail for refused deletes — a real audit-completeness gap. This test pins both the
|
||||
/// friendly-error response AND the unconditional audit entry.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task DeleteAsync_WhenStoreRefuses_ReportsFriendlyError()
|
||||
public async Task DeleteAsync_WhenStoreRefuses_ReportsFriendlyErrorAndAudits()
|
||||
{
|
||||
FakeApiKeyAdminStore adminStore = new() { DeleteResult = false };
|
||||
DashboardApiKeyManagementService service = CreateService(adminStore);
|
||||
FakeApiKeyAuditStore auditStore = new();
|
||||
DashboardApiKeyManagementService service = CreateService(adminStore, auditStore);
|
||||
|
||||
DashboardApiKeyManagementResult result = await service.DeleteAsync(
|
||||
CreateAuthorizedUser(),
|
||||
@@ -160,6 +169,36 @@ public sealed class DashboardApiKeyManagementServiceTests
|
||||
|
||||
Assert.False(result.Succeeded);
|
||||
Assert.Contains("Revoke", result.Message, StringComparison.Ordinal);
|
||||
|
||||
ApiKeyAuditEntry auditEntry = Assert.Single(auditStore.Entries);
|
||||
Assert.Equal("dashboard-delete-key", auditEntry.EventType);
|
||||
Assert.Equal("operator01", auditEntry.KeyId);
|
||||
Assert.Equal("not-found-or-active", auditEntry.Details);
|
||||
}
|
||||
|
||||
/// <summary>
/// Tests-030: a blank key id handed to <see cref="DashboardApiKeyManagementService.DeleteAsync"/>
/// must be rejected by <c>ValidateKeyId</c> (which runs after the authorisation check) with the
/// shared "API key id is required." message. The rejection has to happen before the admin store
/// or the audit store is touched, so this test pins a failed result, zero delete calls, and an
/// empty audit log.
/// </summary>
[Theory]
[InlineData("")]
[InlineData(" ")]
[InlineData("\t")]
public async Task DeleteAsync_BlankKeyId_ReturnsFailure(string blankKeyId)
{
    // Arrange: both fakes start empty so any store or audit activity is attributable to this call.
    FakeApiKeyAuditStore audit = new();
    FakeApiKeyAdminStore store = new();
    DashboardApiKeyManagementService sut = CreateService(store, audit);
    CancellationToken noCancellation = CancellationToken.None;

    // Act
    DashboardApiKeyManagementResult outcome =
        await sut.DeleteAsync(CreateAuthorizedUser(), blankKeyId, noCancellation);

    // Assert: failure is reported and neither collaborator was reached.
    Assert.False(outcome.Succeeded);
    Assert.Equal(0, store.DeleteCount);
    Assert.Empty(audit.Entries);
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
+26
-1
@@ -87,7 +87,12 @@ public sealed class DashboardSessionAdminServiceTests
|
||||
Assert.True(result.Succeeded);
|
||||
Assert.Equal(1, sessionManager.KillCount);
|
||||
Assert.Equal("session-1", sessionManager.LastKilledSessionId);
|
||||
Assert.False(string.IsNullOrWhiteSpace(sessionManager.LastKillReason));
|
||||
|
||||
// Tests-028: pin the literal reason string so a future caller-side change is a deliberate
|
||||
// test update rather than a silent drift. DashboardSessionAdminService passes a hard-coded
|
||||
// "dashboard-admin-kill" so the worker-exit metric (mxgateway.workers.killed) carries a
|
||||
// stable, machine-greppable reason tag.
|
||||
Assert.Equal("dashboard-admin-kill", sessionManager.LastKillReason);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -105,6 +110,26 @@ public sealed class DashboardSessionAdminServiceTests
|
||||
Assert.Equal(0, sessionManager.KillCount);
|
||||
}
|
||||
|
||||
/// <summary>
/// Tests-029: <c>CloseSessionAsync</c> has the same blank-session-id guard as
/// <c>KillWorkerAsync</c> but previously had no parallel test, so a guard-removal regression on
/// the close path would have slipped through. The test is a theory over empty, space and tab
/// ids so its coverage matches the other blank-input theories in the suite instead of checking
/// a single whitespace value.
/// </summary>
/// <param name="blankSessionId">An empty or whitespace-only session id that must be rejected.</param>
[Theory]
[InlineData("")]
[InlineData(" ")]
[InlineData("\t")]
public async Task CloseSessionAsync_BlankSessionId_ReturnsFailure(string blankSessionId)
{
    FakeSessionManager sessionManager = new();
    DashboardSessionAdminService service = CreateService(sessionManager);

    DashboardSessionAdminResult result = await service.CloseSessionAsync(
        CreateUser(DashboardRoles.Admin),
        blankSessionId,
        CancellationToken.None);

    // The guard must fail the request without ever reaching the session manager.
    Assert.False(result.Succeeded);
    Assert.Equal(0, sessionManager.CloseCount);
}
|
||||
|
||||
[Fact]
|
||||
public void CanManage_RejectsUnauthenticatedAndViewer()
|
||||
{
|
||||
|
||||
@@ -17,6 +17,13 @@ public sealed class DashboardSnapshotPublisherTests
|
||||
/// reconnect delay and then re-open the subscription. Before the fix,
|
||||
/// the publisher exited on the first non-cancellation exception and
|
||||
/// the dashboard's snapshot stream went silent until process restart.
|
||||
///
|
||||
/// <para>Tests-031: the reconnect-gap measurement is bounded between the
|
||||
/// moment the first subscribe actually <c>throw</c>s and the moment the
|
||||
/// second subscribe begins. Measuring from <c>startedAt</c> (pre-<c>StartAsync</c>)
|
||||
/// baselined scheduling overhead into the budget and made the lower bound
|
||||
/// flaky on slow CI; recording <c>firstThrowAt</c> inside the fake removes
|
||||
/// that baseline so only the <c>Task.Delay(reconnectDelay)</c> contributes.</para>
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task ExecuteAsync_WhenSnapshotServiceThrowsOnce_ReconnectsAfterDelay()
|
||||
@@ -31,7 +38,6 @@ public sealed class DashboardSnapshotPublisherTests
|
||||
reconnectDelay);
|
||||
|
||||
using CancellationTokenSource cts = new();
|
||||
DateTimeOffset startedAt = DateTimeOffset.UtcNow;
|
||||
Task execute = publisher.StartAsync(cts.Token);
|
||||
await execute.WaitAsync(TestTimeout);
|
||||
|
||||
@@ -42,6 +48,8 @@ public sealed class DashboardSnapshotPublisherTests
|
||||
await WaitUntilAsync(() => snapshotService.SubscribeCount >= 2);
|
||||
await WaitUntilAsync(() => hubContext.SendCount >= 1);
|
||||
|
||||
DateTimeOffset firstThrowAt = snapshotService.FirstThrowAt
|
||||
?? throw new InvalidOperationException("First subscribe did not record a throw timestamp.");
|
||||
DateTimeOffset secondSubscribeAt = snapshotService.SecondSubscribeAt
|
||||
?? throw new InvalidOperationException("Second subscribe did not record a timestamp.");
|
||||
|
||||
@@ -52,10 +60,13 @@ public sealed class DashboardSnapshotPublisherTests
|
||||
$"Expected at least 2 subscribe calls, got {snapshotService.SubscribeCount}.");
|
||||
Assert.True(hubContext.SendCount >= 1);
|
||||
|
||||
// The gap between the throw (first subscribe) and the reconnect
|
||||
// (second subscribe) is bounded below by the reconnect delay. We
|
||||
// give a small slack (10ms) for scheduling jitter on slow CI VMs.
|
||||
TimeSpan gap = secondSubscribeAt - startedAt;
|
||||
// Tests-031: the gap is measured from the moment the first subscribe
|
||||
// actually threw (inside the fake) to the moment the second subscribe
|
||||
// began (also inside the fake). This isolates the publisher's
|
||||
// Task.Delay(reconnectDelay) — no StartAsync / scheduling overhead in
|
||||
// the baseline. The 10ms slack absorbs Task.Delay's coarse Windows
|
||||
// timer quantum (~15ms) when the underlying scheduler wakes early.
|
||||
TimeSpan gap = secondSubscribeAt - firstThrowAt;
|
||||
Assert.True(gap >= reconnectDelay - TimeSpan.FromMilliseconds(10),
|
||||
$"Expected reconnect gap >= {reconnectDelay.TotalMilliseconds}ms; got {gap.TotalMilliseconds}ms.");
|
||||
}
|
||||
@@ -100,6 +111,15 @@ public sealed class DashboardSnapshotPublisherTests
|
||||
private sealed class ThrowOnceThenYieldSnapshotService : IDashboardSnapshotService
|
||||
{
|
||||
public int SubscribeCount { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Tests-031: the wall-clock instant the first <c>WatchSnapshotsAsync</c> throws.
|
||||
/// The reconnect-gap assertion is measured against this timestamp (NOT the
|
||||
/// pre-<c>StartAsync</c> wall clock) so scheduling overhead is not baselined
|
||||
/// into the lower bound.
|
||||
/// </summary>
|
||||
public DateTimeOffset? FirstThrowAt { get; private set; }
|
||||
|
||||
public DateTimeOffset? SecondSubscribeAt { get; private set; }
|
||||
|
||||
public DashboardSnapshot GetSnapshot()
|
||||
@@ -118,6 +138,7 @@ public sealed class DashboardSnapshotPublisherTests
|
||||
// First call: throw after a brief yield so the publisher
|
||||
// observes us as a live producer that failed.
|
||||
await Task.Yield();
|
||||
FirstThrowAt = DateTimeOffset.UtcNow;
|
||||
throw new InvalidOperationException("simulated transient snapshot failure");
|
||||
}
|
||||
|
||||
|
||||
@@ -194,7 +194,17 @@ public sealed class MxAccessGatewayServiceTests
|
||||
Assert.Equal("session-1", sessionManager.LastReadEventsSessionId);
|
||||
}
|
||||
|
||||
/// <summary>Verifies that StreamEvents records send duration metrics when an event is written.</summary>
|
||||
/// <summary>
|
||||
/// Verifies that <c>StreamEvents</c> records the send-duration histogram per event.
|
||||
///
|
||||
/// <para>Tests-027 (concurrency flake): the listener must filter by the specific
|
||||
/// <see cref="System.Diagnostics.Metrics.Meter"/> instance owned by this test, not by the process-shared
|
||||
/// <see cref="GatewayMetrics.MeterName"/>. Otherwise a parallel test that constructs its own
|
||||
/// <see cref="GatewayMetrics"/> and records <c>mxgateway.events.stream_send.duration</c> would
|
||||
/// cross-contaminate <c>families</c> and break the equality assertion below. See the companion
|
||||
/// <see cref="StreamEvents_RecordSendDurationListener_IgnoresMeasurementsFromOtherMetersWithSameName"/>
|
||||
/// regression for the cross-talk reproduction.</para>
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task StreamEvents_WhenEventIsWritten_RecordsSendDuration()
|
||||
{
|
||||
@@ -203,7 +213,7 @@ public sealed class MxAccessGatewayServiceTests
|
||||
List<string> families = [];
|
||||
listener.InstrumentPublished = (instrument, meterListener) =>
|
||||
{
|
||||
if (instrument.Meter.Name == GatewayMetrics.MeterName
|
||||
if (ReferenceEquals(instrument.Meter, metrics.Meter)
|
||||
&& instrument.Name == "mxgateway.events.stream_send.duration")
|
||||
{
|
||||
meterListener.EnableMeasurementEvents(instrument);
|
||||
@@ -212,7 +222,8 @@ public sealed class MxAccessGatewayServiceTests
|
||||
listener.SetMeasurementEventCallback<double>(
|
||||
(instrument, measurement, tags, _) =>
|
||||
{
|
||||
if (instrument.Name != "mxgateway.events.stream_send.duration")
|
||||
if (!ReferenceEquals(instrument.Meter, metrics.Meter)
|
||||
|| instrument.Name != "mxgateway.events.stream_send.duration")
|
||||
{
|
||||
return;
|
||||
}
|
||||
@@ -239,6 +250,69 @@ public sealed class MxAccessGatewayServiceTests
|
||||
Assert.Equal([MxEventFamily.OnDataChange.ToString()], families);
|
||||
}
|
||||
|
||||
/// <summary>
/// Tests-027 regression: a <see cref="MeterListener"/> that filters by the specific
/// <see cref="System.Diagnostics.Metrics.Meter"/> instance (via <see cref="object.ReferenceEquals"/>)
/// must NOT observe measurements recorded on a different <see cref="GatewayMetrics"/> that shares
/// the same <see cref="GatewayMetrics.MeterName"/>. This is the cross-talk vector that previously
/// caused <c>StreamEvents_WhenEventIsWritten_RecordsSendDuration</c> to fail intermittently when
/// run in parallel with another test recording the same histogram.
///
/// <para>The test also counts the measurements that arrive from the sibling meter and requires
/// exactly one. Without that check the test could pass vacuously if the sibling instrument were
/// never enabled on the listener, in which case the reference filter would not be exercised.</para>
/// </summary>
[Fact]
public async Task StreamEvents_RecordSendDurationListener_IgnoresMeasurementsFromOtherMetersWithSameName()
{
    using GatewayMetrics metricsUnderTest = new();
    using GatewayMetrics otherMetrics = new();
    using MeterListener listener = new();
    List<string> families = [];
    int siblingMeasurements = 0;
    listener.InstrumentPublished = (instrument, meterListener) =>
    {
        // Subscribe to the stream_send histogram on BOTH meters so the listener
        // would observe a cross-talk measurement if the callback did not filter.
        if (instrument.Name == "mxgateway.events.stream_send.duration")
        {
            meterListener.EnableMeasurementEvents(instrument);
        }
    };
    listener.SetMeasurementEventCallback<double>(
        (instrument, measurement, tags, _) =>
        {
            if (instrument.Name != "mxgateway.events.stream_send.duration")
            {
                return;
            }

            if (ReferenceEquals(instrument.Meter, otherMetrics.Meter))
            {
                // Seen but deliberately ignored: proves the cross-talk vector is live.
                siblingMeasurements++;
                return;
            }

            if (!ReferenceEquals(instrument.Meter, metricsUnderTest.Meter))
            {
                // Some unrelated GatewayMetrics (e.g. a parallel test) - not ours, not the sibling.
                return;
            }

            foreach (KeyValuePair<string, object?> tag in tags)
            {
                if (tag.Key == "family" && tag.Value is string family)
                {
                    families.Add(family);
                }
            }
        });
    listener.Start();

    // Simulate the cross-talk: another test's GatewayMetrics records a value
    // before the test-under-test does its single event publish. The listener
    // must filter this out by Meter reference.
    otherMetrics.RecordEventStreamSend(MxEventFamily.OnWriteComplete.ToString(), TimeSpan.FromMilliseconds(123));

    FakeSessionManager sessionManager = new();
    sessionManager.Events.Add(CreateWorkerEvent("session-1", workerSequence: 2));
    MxAccessGatewayService service = CreateService(sessionManager, metrics: metricsUnderTest);
    RecordingServerStreamWriter<MxEvent> writer = new();

    await service.StreamEvents(
        new StreamEventsRequest { SessionId = "session-1" },
        writer,
        new TestServerCallContext());

    // The sibling measurement must have reached the listener (otherwise the filter
    // was never exercised) ...
    Assert.Equal(1, siblingMeasurements);

    // ... and only the test-under-test's OnDataChange recording may be collected -
    // the OnWriteComplete recording on the sibling meter must NOT leak through.
    Assert.Equal([MxEventFamily.OnDataChange.ToString()], families);
}
|
||||
|
||||
/// <summary>Verifies that CloseSession throws InvalidArgument when session ID is blank.</summary>
|
||||
[Fact]
|
||||
public async Task CloseSession_WithBlankSessionId_ThrowsInvalidArgument()
|
||||
|
||||
@@ -466,7 +466,12 @@ public sealed class SessionManagerTests
|
||||
Assert.Equal(0, metrics.GetSnapshot().OpenSessions);
|
||||
}
|
||||
|
||||
/// <summary>Verifies that killing a worker removes the session from the registry without calling shutdown.</summary>
|
||||
/// <summary>
|
||||
/// Verifies that killing a worker removes the session from the registry without calling shutdown.
|
||||
/// Tests-028: also pins the <c>reason</c> argument propagating through
|
||||
/// <c>SessionManager.KillWorkerAsync</c> → <c>session.KillWorker(reason)</c> →
|
||||
/// <c>IWorkerClient.Kill(reason)</c>.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task KillWorkerAsync_KillsWorkerAndRemovesSession()
|
||||
{
|
||||
@@ -480,12 +485,54 @@ public sealed class SessionManagerTests
|
||||
Assert.False(result.AlreadyClosed);
|
||||
Assert.Equal(SessionState.Closed, result.FinalState);
|
||||
Assert.Equal(1, workerClient.KillCount);
|
||||
Assert.Equal("test-kill", workerClient.LastKillReason);
|
||||
Assert.Equal(0, workerClient.ShutdownCount);
|
||||
Assert.False(manager.TryGetSession(session.SessionId, out _));
|
||||
Assert.Equal(1, metrics.GetSnapshot().SessionsClosed);
|
||||
Assert.Equal(0, metrics.GetSnapshot().OpenSessions);
|
||||
}
|
||||
|
||||
/// <summary>
/// Tests-028: the <c>reason</c> argument of <see cref="SessionManager.KillWorkerAsync"/> is guarded by
/// <see cref="ArgumentException.ThrowIfNullOrWhiteSpace"/>. An empty or whitespace-only reason must
/// surface as <see cref="ArgumentException"/> before any session lookup or worker call happens, so
/// the worker is never killed and the session stays registered.
/// </summary>
[Theory]
[InlineData("")]
[InlineData(" ")]
[InlineData("\t")]
public async Task KillWorkerAsync_WithBlankReason_ThrowsArgumentException(string blankReason)
{
    // Arrange: one open session backed by a fake worker whose kill calls are counted.
    FakeWorkerClient worker = new();
    FakeSessionWorkerClientFactory factory = new(worker);
    SessionManager sut = CreateManager(factory);
    GatewaySession opened = await sut.OpenSessionAsync(CreateOpenRequest(), "client-1", CancellationToken.None);

    // Act
    Func<Task> kill = async () => await sut.KillWorkerAsync(opened.SessionId, blankReason, CancellationToken.None);

    // Assert: the guard throws, and nothing downstream of it ran.
    await Assert.ThrowsAsync<ArgumentException>(kill);
    Assert.Equal(0, worker.KillCount);
    Assert.True(sut.TryGetSession(opened.SessionId, out _));
}
|
||||
|
||||
/// <summary>
/// Tests-028: <see cref="ArgumentException.ThrowIfNullOrWhiteSpace"/> also rejects <c>null</c>, but
/// with the more specific <see cref="ArgumentNullException"/>. The blank-reason
/// <see cref="TheoryAttribute"/> asserts exactly <see cref="ArgumentException"/> through
/// <c>Assert.ThrowsAsync</c>, which does not accept derived exception types, so the null case
/// cannot share that theory and is pinned by its own fact.
/// NOTE(review): passing <c>null</c> through <see cref="InlineDataAttribute"/> to a non-nullable
/// <c>string</c> parameter is presumably also flagged by nullable analysis - confirm against the
/// project's analyzer settings.
/// </summary>
[Fact]
public async Task KillWorkerAsync_WithNullReason_ThrowsArgumentNullException()
{
    FakeWorkerClient workerClient = new();
    SessionManager manager = CreateManager(new FakeSessionWorkerClientFactory(workerClient));
    GatewaySession session = await manager.OpenSessionAsync(CreateOpenRequest(), "client-1", CancellationToken.None);

    // null! silences the nullable warning; the point of the test is the runtime guard.
    await Assert.ThrowsAsync<ArgumentNullException>(
        async () => await manager.KillWorkerAsync(session.SessionId, null!, CancellationToken.None));

    // The guard runs first: the worker was not killed and the session is still registered.
    Assert.Equal(0, workerClient.KillCount);
    Assert.True(manager.TryGetSession(session.SessionId, out _));
}
|
||||
|
||||
/// <summary>Verifies that killing the worker for an unknown session raises SessionNotFound.</summary>
|
||||
[Fact]
|
||||
public async Task KillWorkerAsync_WhenSessionMissing_ThrowsSessionNotFound()
|
||||
@@ -827,6 +874,15 @@ public sealed class SessionManagerTests
|
||||
/// <summary>Gets the number of times kill was called on the fake worker client.</summary>
|
||||
public int KillCount { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the last reason argument observed by <see cref="Kill"/>. Tests-028:
|
||||
/// pins the reason-string propagation through
|
||||
/// <c>SessionManager.KillWorkerAsync</c> → <c>session.KillWorker(reason)</c> →
|
||||
/// <c>IWorkerClient.Kill(reason)</c>. Without this, the chain could silently
|
||||
/// drop or substitute the reason argument and existing tests would still pass.
|
||||
/// </summary>
|
||||
public string? LastKillReason { get; private set; }
|
||||
|
||||
/// <summary>Gets the number of times dispose was called on the fake worker client.</summary>
|
||||
public int DisposeCount { get; private set; }
|
||||
|
||||
@@ -913,6 +969,7 @@ public sealed class SessionManagerTests
|
||||
public void Kill(string reason)
|
||||
{
|
||||
KillCount++;
|
||||
LastKillReason = reason;
|
||||
if (KillException is not null)
|
||||
{
|
||||
throw KillException;
|
||||
|
||||
Reference in New Issue
Block a user