feat(audit): M5.2 per-node stuck-count KPIs (T6) — repo per-node aggregation, actor message pair, CentralUI tiles

This commit is contained in:
Joseph Doherty
2026-06-16 21:34:14 -04:00
parent a07ff28f10
commit 209f368cb5
25 changed files with 840 additions and 6 deletions
@@ -300,6 +300,63 @@ VALUES
: null)).ToList();
}
/// <inheritdoc />
public async Task<IReadOnlyList<NodeNotificationKpiSnapshot>> ComputePerNodeKpisAsync(
    DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default)
{
    // Reference instant for the age calculation; captured before any query runs.
    var now = DateTimeOffset.UtcNow;

    // Rows whose SourceNode is NULL (legacy / unstamped) are left out of every
    // count below: a per-node KPI needs a known node identity.
    var depthByNode = await CountByNodeAsync(
        row => (row.Status == NotificationStatus.Pending || row.Status == NotificationStatus.Retrying)
               && row.SourceNode != null,
        cancellationToken);

    var stuckByNode = await CountByNodeAsync(
        row => (row.Status == NotificationStatus.Pending || row.Status == NotificationStatus.Retrying)
               && row.CreatedAt < stuckCutoff
               && row.SourceNode != null,
        cancellationToken);

    var parkedByNode = await CountByNodeAsync(
        row => row.Status == NotificationStatus.Parked && row.SourceNode != null,
        cancellationToken);

    var deliveredByNode = await CountByNodeAsync(
        row => row.Status == NotificationStatus.Delivered
               && row.DeliveredAt != null && row.DeliveredAt >= deliveredSince
               && row.SourceNode != null,
        cancellationToken);

    // Earliest CreatedAt among Pending/Retrying rows, per node. The rows are
    // fetched first and reduced in memory rather than with a SQL Min (the
    // DateTimeOffset converter makes a SQL Min awkward).
    var pendingRows = await _context.Notifications
        .Where(row => (row.Status == NotificationStatus.Pending
                       || row.Status == NotificationStatus.Retrying)
                      && row.SourceNode != null)
        .Select(row => new { row.SourceNode, row.CreatedAt })
        .ToListAsync(cancellationToken);
    var oldestByNode = pendingRows
        .GroupBy(entry => entry.SourceNode!)
        .ToDictionary(group => group.Key, group => group.Min(entry => entry.CreatedAt));

    // Union of every node seen by any count, de-duplicated and in ordinal order.
    var nodeNames = new SortedSet<string>(StringComparer.Ordinal);
    nodeNames.UnionWith(depthByNode.Keys);
    nodeNames.UnionWith(stuckByNode.Keys);
    nodeNames.UnionWith(parkedByNode.Keys);
    nodeNames.UnionWith(deliveredByNode.Keys);

    var snapshots = new List<NodeNotificationKpiSnapshot>(nodeNames.Count);
    foreach (var node in nodeNames)
    {
        // A node absent from a count dictionary contributes 0 for that KPI;
        // a node with no Pending/Retrying row has no oldest-pending age.
        snapshots.Add(new NodeNotificationKpiSnapshot(
            SourceNode: node,
            QueueDepth: depthByNode.GetValueOrDefault(node),
            StuckCount: stuckByNode.GetValueOrDefault(node),
            ParkedCount: parkedByNode.GetValueOrDefault(node),
            DeliveredLastInterval: deliveredByNode.GetValueOrDefault(node),
            OldestPendingAge: oldestByNode.TryGetValue(node, out var createdAt)
                ? now - createdAt
                : null));
    }

    return snapshots;
}
/// <summary>Counts notification rows matching <paramref name="predicate"/>, grouped by source site.</summary>
private async Task<Dictionary<string, int>> CountBySiteAsync(
System.Linq.Expressions.Expression<Func<Notification, bool>> predicate,
@@ -312,6 +369,22 @@ VALUES
.ToDictionaryAsync(x => x.Site, x => x.Count, cancellationToken);
}
/// <summary>
/// Counts notification rows matching <paramref name="predicate"/>, grouped by source node.
/// Rows with a null <c>SourceNode</c> are always excluded by this helper, so a caller
/// predicate that omits the null guard cannot produce a null grouping key.
/// </summary>
/// <param name="predicate">Filter applied to the notification rows before grouping.</param>
/// <param name="cancellationToken">Token passed through to the query execution.</param>
/// <returns>A dictionary mapping each source node to the number of matching rows.</returns>
private async Task<Dictionary<string, int>> CountByNodeAsync(
    System.Linq.Expressions.Expression<Func<Notification, bool>> predicate,
    CancellationToken cancellationToken)
{
    return await _context.Notifications
        .Where(predicate)
        // Enforce the non-null guard here instead of trusting every caller:
        // a null group key cannot be used as a dictionary key and would make
        // the materialization below throw.
        .Where(n => n.SourceNode != null)
        .GroupBy(n => n.SourceNode!)
        .Select(g => new { Node = g.Key, Count = g.Count() })
        .ToDictionaryAsync(x => x.Node, x => x.Count, cancellationToken);
}
/// <inheritdoc />
public async Task<int> SaveChangesAsync(CancellationToken cancellationToken = default)
{
    // Delegates to the underlying context and returns its result unchanged.
    return await _context.SaveChangesAsync(cancellationToken);
}
@@ -324,6 +324,61 @@ ORDER BY CreatedAtUtc DESC, TrackedOperationId DESC;";
StuckCount: stuck.GetValueOrDefault(site))).ToList();
}
/// <inheritdoc />
public async Task<IReadOnlyList<SiteCallNodeKpiSnapshot>> ComputePerNodeKpisAsync(
    DateTime stuckCutoff, DateTime intervalSince, CancellationToken ct = default)
{
    // Reference instant for the age calculation; captured before any query runs.
    var now = DateTime.UtcNow;

    // Every predicate below carries a SourceNode != null guard: per-node KPIs
    // need a known node identity, and the guard keeps the GROUP BY key non-null.
    var bufferedByNode = await CountByNodeAsync(
        call => call.TerminalAtUtc == null && call.SourceNode != null, ct);

    var parkedByNode = await CountByNodeAsync(
        call => call.Status == StatusParked && call.SourceNode != null, ct);

    var failedByNode = await CountByNodeAsync(
        call => call.Status == StatusFailed
                && call.TerminalAtUtc != null && call.TerminalAtUtc >= intervalSince
                && call.SourceNode != null, ct);

    var deliveredByNode = await CountByNodeAsync(
        call => call.Status == StatusDelivered
                && call.TerminalAtUtc != null && call.TerminalAtUtc >= intervalSince
                && call.SourceNode != null, ct);

    var stuckByNode = await CountByNodeAsync(
        call => call.TerminalAtUtc == null && call.CreatedAtUtc < stuckCutoff
                && call.SourceNode != null, ct);

    // Earliest CreatedAtUtc among non-terminal rows, per node, computed by the
    // database (GROUP BY with MIN) and then indexed by node name.
    var oldestQuery =
        from call in _context.SiteCalls
        where call.TerminalAtUtc == null && call.SourceNode != null
        group call by call.SourceNode! into nodeGroup
        select new { Node = nodeGroup.Key, Oldest = nodeGroup.Min(c => c.CreatedAtUtc) };
    var oldestRows = await oldestQuery.ToListAsync(ct);
    var oldestByNode = oldestRows.ToDictionary(entry => entry.Node, entry => entry.Oldest);

    // Union of every node seen by any count, de-duplicated and in ordinal order.
    var nodeNames = new SortedSet<string>(StringComparer.Ordinal);
    nodeNames.UnionWith(bufferedByNode.Keys);
    nodeNames.UnionWith(parkedByNode.Keys);
    nodeNames.UnionWith(failedByNode.Keys);
    nodeNames.UnionWith(deliveredByNode.Keys);
    nodeNames.UnionWith(stuckByNode.Keys);

    var snapshots = new List<SiteCallNodeKpiSnapshot>(nodeNames.Count);
    foreach (var node in nodeNames)
    {
        // A node absent from a count dictionary contributes 0 for that KPI;
        // a node with no non-terminal row has no oldest-pending age.
        snapshots.Add(new SiteCallNodeKpiSnapshot(
            SourceNode: node,
            BufferedCount: bufferedByNode.GetValueOrDefault(node),
            ParkedCount: parkedByNode.GetValueOrDefault(node),
            FailedLastInterval: failedByNode.GetValueOrDefault(node),
            DeliveredLastInterval: deliveredByNode.GetValueOrDefault(node),
            OldestPendingAge: oldestByNode.TryGetValue(node, out var createdAt)
                ? now - createdAt
                : null,
            StuckCount: stuckByNode.GetValueOrDefault(node)));
    }

    return snapshots;
}
/// <summary>Counts <c>SiteCalls</c> rows matching <paramref name="predicate"/>, grouped by source site.</summary>
private async Task<Dictionary<string, int>> CountBySiteAsync(
System.Linq.Expressions.Expression<Func<SiteCall, bool>> predicate,
@@ -336,6 +391,22 @@ ORDER BY CreatedAtUtc DESC, TrackedOperationId DESC;";
.ToDictionaryAsync(x => x.Site, x => x.Count, ct);
}
/// <summary>
/// Counts <c>SiteCalls</c> rows matching <paramref name="predicate"/>, grouped by source node.
/// Rows with a null <c>SourceNode</c> are always excluded by this helper, so a caller
/// predicate that omits the null guard cannot produce a null grouping key.
/// </summary>
/// <param name="predicate">Filter applied to the <c>SiteCalls</c> rows before grouping.</param>
/// <param name="ct">Token passed through to the query execution.</param>
/// <returns>A dictionary mapping each source node to the number of matching rows.</returns>
private async Task<Dictionary<string, int>> CountByNodeAsync(
    System.Linq.Expressions.Expression<Func<SiteCall, bool>> predicate,
    CancellationToken ct)
{
    return await _context.SiteCalls
        .Where(predicate)
        // Enforce the non-null guard here instead of trusting every caller:
        // a null group key cannot be used as a dictionary key and would make
        // the materialization below throw.
        .Where(s => s.SourceNode != null)
        .GroupBy(s => s.SourceNode!)
        .Select(g => new { Node = g.Key, Count = g.Count() })
        .ToDictionaryAsync(x => x.Node, x => x.Count, ct);
}
private static int GetRankOrThrow(string status)
{
if (!StatusRank.TryGetValue(status, out var rank))