fix(notification-outbox): give KPI response a failure shape; log status-query faults

This commit is contained in:
Joseph Doherty
2026-05-19 01:55:46 -04:00
parent 82e3eb0e93
commit 77a05a8960
4 changed files with 68 additions and 11 deletions

View File

@@ -82,9 +82,13 @@ public record NotificationKpiRequest(
/// <summary>
/// Central -> Outbox UI: KPI summary for the notification outbox dashboard.
/// On a repository fault <see cref="Success"/> is <c>false</c>, <see cref="ErrorMessage"/>
/// carries the cause, and the KPI fields are zeroed/<c>null</c>.
/// </summary>
public record NotificationKpiResponse(
string CorrelationId,
bool Success,
string? ErrorMessage,
int QueueDepth,
int StuckCount,
int ParkedCount,

View File

@@ -298,9 +298,8 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
{
var sender = Sender;
var now = DateTimeOffset.UtcNow;
var stuckThreshold = _options.StuckAgeThreshold;
QueryOutboxAsync(request, now, stuckThreshold).PipeTo(
QueryOutboxAsync(request, now).PipeTo(
sender,
success: response => response,
failure: ex => new NotificationOutboxQueryResponse(
@@ -312,7 +311,7 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
}
private async Task<NotificationOutboxQueryResponse> QueryOutboxAsync(
NotificationOutboxQueryRequest request, DateTimeOffset now, TimeSpan stuckThreshold)
NotificationOutboxQueryRequest request, DateTimeOffset now)
{
var filter = new NotificationOutboxFilter(
Status: ParseEnum<NotificationStatus>(request.StatusFilter),
@@ -321,7 +320,7 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
ListName: request.ListNameFilter,
SubjectKeyword: request.SubjectKeyword,
StuckOnly: request.StuckOnly,
StuckCutoff: request.StuckOnly ? now - stuckThreshold : null,
StuckCutoff: request.StuckOnly ? StuckCutoff(now) : null,
From: request.From,
To: request.To);
@@ -329,7 +328,7 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
var repository = scope.ServiceProvider.GetRequiredService<INotificationOutboxRepository>();
var (rows, totalCount) = await repository.QueryAsync(filter, request.PageNumber, request.PageSize);
var stuckCutoff = now - stuckThreshold;
var stuckCutoff = StuckCutoff(now);
var summaries = rows
.Select(row => new NotificationSummary(
row.NotificationId,
@@ -362,9 +361,17 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
StatusQueryAsync(query).PipeTo(
sender,
success: response => response,
failure: _ => new NotificationStatusResponse(
query.CorrelationId, Found: false, Status: string.Empty,
RetryCount: 0, LastError: null, DeliveredAt: null));
failure: ex =>
{
// NotificationStatusResponse has no error field, so a repository fault is
// reported as Found: false — log the fault so a transient DB error is not
// silently indistinguishable from a genuinely-missing notification.
_logger.LogWarning(
ex, "Status query for notification {NotificationId} failed.", query.NotificationId);
return new NotificationStatusResponse(
query.CorrelationId, Found: false, Status: string.Empty,
RetryCount: 0, LastError: null, DeliveredAt: null);
});
}
private async Task<NotificationStatusResponse> StatusQueryAsync(NotificationStatusQuery query)
@@ -482,10 +489,21 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
{
var sender = Sender;
var now = DateTimeOffset.UtcNow;
var stuckCutoff = now - _options.StuckAgeThreshold;
var stuckCutoff = StuckCutoff(now);
var deliveredSince = now - _options.DeliveredKpiWindow;
ComputeKpisAsync(request.CorrelationId, stuckCutoff, deliveredSince).PipeTo(sender);
ComputeKpisAsync(request.CorrelationId, stuckCutoff, deliveredSince).PipeTo(
sender,
success: response => response,
failure: ex => new NotificationKpiResponse(
request.CorrelationId,
Success: false,
ErrorMessage: ex.GetBaseException().Message,
QueueDepth: 0,
StuckCount: 0,
ParkedCount: 0,
DeliveredLastInterval: 0,
OldestPendingAge: null));
}
private async Task<NotificationKpiResponse> ComputeKpisAsync(
@@ -497,6 +515,8 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
return new NotificationKpiResponse(
correlationId,
Success: true,
ErrorMessage: null,
snapshot.QueueDepth,
snapshot.StuckCount,
snapshot.ParkedCount,
@@ -504,6 +524,12 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
snapshot.OldestPendingAge);
}
/// <summary>
/// Computes the stuck cutoff for a reference time: <paramref name="now"/> minus the configured
/// <see cref="NotificationOutboxOptions.StuckAgeThreshold"/>.
/// </summary>
/// <param name="now">The reference instant the cutoff is measured back from.</param>
/// <returns>The instant obtained by subtracting the stuck-age threshold from <paramref name="now"/>.</returns>
private DateTimeOffset StuckCutoff(DateTimeOffset now)
{
    TimeSpan stuckAge = _options.StuckAgeThreshold;
    return now - stuckAge;
}
/// <summary>
/// A notification counts as stuck when it is still in a non-terminal status
/// (<c>Pending</c> or <c>Retrying</c>) and was created before the supplied cutoff.

View File

@@ -169,9 +169,11 @@ public class NotificationMessagesTests
public void NotificationKpiResponse_WithExpression_ChangesSingleField()
{
var kpi = new NotificationKpiResponse(
"corr-1", 10, 2, 1, 5, TimeSpan.FromMinutes(3));
"corr-1", Success: true, ErrorMessage: null, 10, 2, 1, 5, TimeSpan.FromMinutes(3));
var updated = kpi with { QueueDepth = 12 };
Assert.True(kpi.Success);
Assert.Null(kpi.ErrorMessage);
Assert.Equal(10, kpi.QueueDepth);
Assert.Equal(12, updated.QueueDepth);
Assert.Equal(2, updated.StuckCount);

View File

@@ -323,6 +323,8 @@ public class NotificationOutboxActorQueryTests : TestKit
var response = ExpectMsg<NotificationKpiResponse>();
Assert.Equal("corr-11", response.CorrelationId);
Assert.True(response.Success);
Assert.Null(response.ErrorMessage);
Assert.Equal(7, response.QueueDepth);
Assert.Equal(2, response.StuckCount);
Assert.Equal(3, response.ParkedCount);
@@ -332,4 +334,27 @@ public class NotificationOutboxActorQueryTests : TestKit
_repository.Received(1).ComputeKpisAsync(
Arg.Any<DateTimeOffset>(), Arg.Any<DateTimeOffset>(), Arg.Any<CancellationToken>());
}
/// <summary>
/// When the repository's KPI computation throws, the actor must answer with a
/// <c>NotificationKpiResponse</c> flagged as failed (zeroed counters, error text populated)
/// rather than letting the fault surface as a <c>Status.Failure</c>.
/// </summary>
[Fact]
public void KpiRequest_RepositoryThrows_RepliesFailureResponse()
{
    // Arrange: every KPI computation on the substitute repository faults.
    const string correlationId = "corr-12";
    const string faultMessage = "kpi db down";
    _repository
        .ComputeKpisAsync(
            Arg.Any<DateTimeOffset>(),
            Arg.Any<DateTimeOffset>(),
            Arg.Any<CancellationToken>())
        .ThrowsAsync(new InvalidOperationException(faultMessage));
    var outbox = CreateActor();

    // Act: ask for KPIs, replying to the test actor.
    outbox.Tell(new NotificationKpiRequest(correlationId), TestActor);

    // Assert: the reply is a typed failure response echoing the correlation id...
    var reply = ExpectMsg<NotificationKpiResponse>();
    Assert.Equal(correlationId, reply.CorrelationId);
    Assert.False(reply.Success);

    // ...carrying the underlying fault text...
    Assert.NotNull(reply.ErrorMessage);
    Assert.Contains(faultMessage, reply.ErrorMessage);

    // ...with every KPI field zeroed or null.
    Assert.Equal(0, reply.QueueDepth);
    Assert.Equal(0, reply.StuckCount);
    Assert.Equal(0, reply.ParkedCount);
    Assert.Equal(0, reply.DeliveredLastInterval);
    Assert.Null(reply.OldestPendingAge);
}
}