Resolve Tests-027..031: flake root cause + coverage gaps

Tests-027 GatewayMetrics exposes its internal Meter; the StreamEvents_WhenEventIsWritten_RecordsSendDuration listener now filters by ReferenceEquals(instrument.Meter, metrics.Meter) instead of Meter.Name, so parallel tests with their own GatewayMetrics no longer cross-contaminate the families list. Tests-028 FakeWorkerClient.Kill now captures LastKillReason; SessionManager.KillWorkerAsync tests pin the reason propagation end-to-end and cover the blank/null guard. The DashboardSessionAdminService kill test pins the literal dashboard-admin-kill reason. Tests-029 Added CloseSessionAsync_BlankSessionId_ReturnsFailure to mirror the existing KillWorkerAsync blank-id coverage. Tests-030 DeleteAsync_WhenStoreRefuses_ReportsFriendlyError renamed and extended to assert the dashboard-delete-key audit row with Details = not-found-or-active. Added DeleteAsync_BlankKeyId_ReturnsFailure. Tests-031 DashboardSnapshotPublisher reconnect test now measures the gap from the first throw inside the fake (firstThrowAt) to secondSubscribeAt, isolating Task.Delay from StartAsync / scheduling overhead. All resolved at 2026-05-24; 512/512 gateway tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-24 09:28:54 -04:00
parent 430187c28b
commit 6bae5ea3a3
7 changed files with 255 additions and 18 deletions
@@ -147,11 +147,20 @@ public sealed class DashboardApiKeyManagementServiceTests
            && entry.Details == "deleted");
    }

+    /// <summary>
+    /// Tests-030: when the admin store refuses the delete (returns <c>false</c>), the service
+    /// still emits a <c>dashboard-delete-key</c> audit entry with <c>Details = "not-found-or-active"</c>
+    /// because <c>AppendAuditAsync</c> runs unconditionally after the store call. A regression that
+    /// moved the audit-append call inside the <c>if (deleted)</c> branch would silently drop the
+    /// audit trail for refused deletes — a real audit-completeness gap. This test pins both the
+    /// friendly-error response AND the unconditional audit entry.
+    /// </summary>
    [Fact]
-    public async Task DeleteAsync_WhenStoreRefuses_ReportsFriendlyError()
+    public async Task DeleteAsync_WhenStoreRefuses_ReportsFriendlyErrorAndAudits()
    {
        FakeApiKeyAdminStore adminStore = new() { DeleteResult = false };
-        DashboardApiKeyManagementService service = CreateService(adminStore);
+        FakeApiKeyAuditStore auditStore = new();
+        DashboardApiKeyManagementService service = CreateService(adminStore, auditStore);

        DashboardApiKeyManagementResult result = await service.DeleteAsync(
            CreateAuthorizedUser(),
@@ -160,6 +169,36 @@ public sealed class DashboardApiKeyManagementServiceTests

        Assert.False(result.Succeeded);
        Assert.Contains("Revoke", result.Message, StringComparison.Ordinal);
+
+        ApiKeyAuditEntry auditEntry = Assert.Single(auditStore.Entries);
+        Assert.Equal("dashboard-delete-key", auditEntry.EventType);
+        Assert.Equal("operator01", auditEntry.KeyId);
+        Assert.Equal("not-found-or-active", auditEntry.Details);
+    }
+
+    /// <summary>
+    /// Tests-030: <see cref="DashboardApiKeyManagementService.DeleteAsync"/> calls
+    /// <c>ValidateKeyId</c> after the authorisation check. A blank key id must fail with the
+    /// shared "API key id is required." message before any store or audit call runs.
+    /// </summary>
+    [Theory]
+    [InlineData("")]
+    [InlineData(" ")]
+    [InlineData("\t")]
+    public async Task DeleteAsync_BlankKeyId_ReturnsFailure(string blankKeyId)
+    {
+        FakeApiKeyAdminStore adminStore = new();
+        FakeApiKeyAuditStore auditStore = new();
+        DashboardApiKeyManagementService service = CreateService(adminStore, auditStore);
+
+        DashboardApiKeyManagementResult result = await service.DeleteAsync(
+            CreateAuthorizedUser(),
+            blankKeyId,
+            CancellationToken.None);
+
+        Assert.False(result.Succeeded);
+        Assert.Equal(0, adminStore.DeleteCount);
+        Assert.Empty(auditStore.Entries);
    }

    /// <summary>
@@ -87,7 +87,12 @@ public sealed class DashboardSessionAdminServiceTests
        Assert.True(result.Succeeded);
        Assert.Equal(1, sessionManager.KillCount);
        Assert.Equal("session-1", sessionManager.LastKilledSessionId);
-        Assert.False(string.IsNullOrWhiteSpace(sessionManager.LastKillReason));
+
+        // Tests-028: pin the literal reason string so a future caller-side change is a deliberate
+        // test update rather than a silent drift. DashboardSessionAdminService passes a hard-coded
+        // "dashboard-admin-kill" so the worker-exit metric (mxgateway.workers.killed) carries a
+        // stable, machine-greppable reason tag.
+        Assert.Equal("dashboard-admin-kill", sessionManager.LastKillReason);
    }

    [Fact]
@@ -105,6 +110,26 @@ public sealed class DashboardSessionAdminServiceTests
        Assert.Equal(0, sessionManager.KillCount);
    }

+    /// <summary>
+    /// Tests-029: <c>CloseSessionAsync</c> has the same blank-session-id guard as
+    /// <c>KillWorkerAsync</c> but previously had no parallel test. Coverage was asymmetric.
+    /// A guard-removal regression on the close path would slip through.
+    /// </summary>
+    [Fact]
+    public async Task CloseSessionAsync_BlankSessionId_ReturnsFailure()
+    {
+        FakeSessionManager sessionManager = new();
+        DashboardSessionAdminService service = CreateService(sessionManager);
+
+        DashboardSessionAdminResult result = await service.CloseSessionAsync(
+            CreateUser(DashboardRoles.Admin),
+            "   ",
+            CancellationToken.None);
+
+        Assert.False(result.Succeeded);
+        Assert.Equal(0, sessionManager.CloseCount);
+    }
+
    [Fact]
    public void CanManage_RejectsUnauthenticatedAndViewer()
    {
@@ -17,6 +17,13 @@ public sealed class DashboardSnapshotPublisherTests
    /// reconnect delay and then re-open the subscription. Before the fix,
    /// the publisher exited on the first non-cancellation exception and
    /// the dashboard's snapshot stream went silent until process restart.
+    ///
+    /// <para>Tests-031: the reconnect-gap measurement is bounded between the
+    /// moment the first subscribe actually <c>throw</c>s and the moment the
+    /// second subscribe begins. Measuring from <c>startedAt</c> (pre-<c>StartAsync</c>)
+    /// baselined scheduling overhead into the budget and made the lower bound
+    /// flaky on slow CI; recording <c>firstThrowAt</c> inside the fake removes
+    /// that baseline so only the <c>Task.Delay(reconnectDelay)</c> contributes.</para>
    /// </summary>
    [Fact]
    public async Task ExecuteAsync_WhenSnapshotServiceThrowsOnce_ReconnectsAfterDelay()
@@ -31,7 +38,6 @@ public sealed class DashboardSnapshotPublisherTests
            reconnectDelay);

        using CancellationTokenSource cts = new();
-        DateTimeOffset startedAt = DateTimeOffset.UtcNow;
        Task execute = publisher.StartAsync(cts.Token);
        await execute.WaitAsync(TestTimeout);

@@ -42,6 +48,8 @@ public sealed class DashboardSnapshotPublisherTests
        await WaitUntilAsync(() => snapshotService.SubscribeCount >= 2);
        await WaitUntilAsync(() => hubContext.SendCount >= 1);

+        DateTimeOffset firstThrowAt = snapshotService.FirstThrowAt
+            ?? throw new InvalidOperationException("First subscribe did not record a throw timestamp.");
        DateTimeOffset secondSubscribeAt = snapshotService.SecondSubscribeAt
            ?? throw new InvalidOperationException("Second subscribe did not record a timestamp.");

@@ -52,10 +60,13 @@ public sealed class DashboardSnapshotPublisherTests
            $"Expected at least 2 subscribe calls, got {snapshotService.SubscribeCount}.");
        Assert.True(hubContext.SendCount >= 1);

-        // The gap between the throw (first subscribe) and the reconnect
-        // (second subscribe) is bounded below by the reconnect delay. We
-        // give a small slack (10ms) for scheduling jitter on slow CI VMs.
-        TimeSpan gap = secondSubscribeAt - startedAt;
+        // Tests-031: the gap is measured from the moment the first subscribe
+        // actually threw (inside the fake) to the moment the second subscribe
+        // began (also inside the fake). This isolates the publisher's
+        // Task.Delay(reconnectDelay) — no StartAsync / scheduling overhead in
+        // the baseline. The 10ms slack absorbs Task.Delay's coarse Windows
+        // timer quantum (~15ms) when the underlying scheduler wakes early.
+        TimeSpan gap = secondSubscribeAt - firstThrowAt;
        Assert.True(gap >= reconnectDelay - TimeSpan.FromMilliseconds(10),
            $"Expected reconnect gap >= {reconnectDelay.TotalMilliseconds}ms; got {gap.TotalMilliseconds}ms.");
    }
@@ -100,6 +111,15 @@ public sealed class DashboardSnapshotPublisherTests
    private sealed class ThrowOnceThenYieldSnapshotService : IDashboardSnapshotService
    {
        public int SubscribeCount { get; private set; }
+
+        /// <summary>
+        /// Tests-031: the wall-clock instant the first <c>WatchSnapshotsAsync</c> throws.
+        /// The reconnect-gap assertion is measured against this timestamp (NOT the
+        /// pre-<c>StartAsync</c> wall clock) so scheduling overhead is not baselined
+        /// into the lower bound.
+        /// </summary>
+        public DateTimeOffset? FirstThrowAt { get; private set; }
+
        public DateTimeOffset? SecondSubscribeAt { get; private set; }

        public DashboardSnapshot GetSnapshot()
@@ -118,6 +138,7 @@ public sealed class DashboardSnapshotPublisherTests
                // First call: throw after a brief yield so the publisher
                // observes us as a live producer that failed.
                await Task.Yield();
+                FirstThrowAt = DateTimeOffset.UtcNow;
                throw new InvalidOperationException("simulated transient snapshot failure");
            }