feat(audit): M5.2 per-node stuck-count KPIs (T6) — repo per-node aggregation, actor message pair, CentralUI tiles

This commit is contained in:
Joseph Doherty
2026-06-16 21:34:14 -04:00
parent a07ff28f10
commit 209f368cb5
25 changed files with 840 additions and 6 deletions
@@ -58,3 +58,31 @@
{
<div class="text-muted small mb-3">Site Call KPIs unavailable: @ErrorMessage</div>
}
@* ── Per-node stuck/parked sub-table (T6: M5.2 per-node stuck-count KPIs) ── *@
@* Rendered only when HasNodeBreakdown is true, i.e. the parent flagged the
   per-node query as available and supplied at least one snapshot. *@
@if (HasNodeBreakdown)
{
<div class="mb-3">
<div class="d-flex justify-content-between align-items-center mb-1">
<small class="text-muted">By node</small>
</div>
<table class="table table-sm table-borderless mb-0 site-call-kpi-node-table">
<thead class="table-light">
<tr>
<th class="small py-1">Node</th>
<th class="text-end small py-1">Stuck</th>
<th class="text-end small py-1">Parked</th>
</tr>
</thead>
<tbody>
@* HasNodeBreakdown has already established that PerNodeSnapshots is non-null,
   which is why the null-forgiving "!" is used here. *@
@foreach (var n in PerNodeSnapshots!)
{
<tr @key="n.SourceNode">
@* Non-zero counts are highlighted: stuck with text-warning, parked with
   text-danger. A zero count gets no extra class. *@
<td class="small py-1"><code>@n.SourceNode</code></td>
<td class="text-end font-monospace small py-1 @(n.StuckCount > 0 ? "text-warning" : "")">@n.StuckCount</td>
<td class="text-end font-monospace small py-1 @(n.ParkedCount > 0 ? "text-danger" : "")">@n.ParkedCount</td>
</tr>
}
</tbody>
</table>
</div>
}
@@ -1,5 +1,6 @@
using Microsoft.AspNetCore.Components;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
namespace ZB.MOM.WW.ScadaBridge.CentralUI.Components.Health;
@@ -59,6 +60,24 @@ public partial class SiteCallKpiTiles
/// </summary>
[Parameter] public string? ErrorMessage { get; set; }
/// <summary>
/// Optional per-node KPI breakdown (T6: M5.2 per-node stuck-count KPIs).
/// A compact node-level stuck/parked sub-table is rendered below the main
/// tiles only when this list is non-null and non-empty and
/// <see cref="PerNodeAvailable"/> is <c>true</c> (see
/// <see cref="HasNodeBreakdown"/>). <c>null</c> means the parent has not
/// loaded it yet or has opted out — the sub-table is suppressed entirely;
/// an empty list suppresses it as well.
/// </summary>
[Parameter] public IReadOnlyList<SiteCallNodeKpiSnapshot>? PerNodeSnapshots { get; set; }
/// <summary>
/// Set by the parent to <c>true</c> when <see cref="PerNodeSnapshots"/> is the
/// result of a successful query. When <c>false</c> the sub-table is suppressed
/// regardless of what <see cref="PerNodeSnapshots"/> currently holds, so a
/// list left over from an earlier load is not shown after a load failure.
/// </summary>
[Parameter] public bool PerNodeAvailable { get; set; }
/// <summary>
/// Gate for the per-node sub-table: <c>true</c> only when the parent marked
/// the per-node data as available and supplied at least one snapshot.
/// </summary>
internal bool HasNodeBreakdown
{
    get
    {
        if (!PerNodeAvailable)
        {
            return false;
        }

        var snapshots = PerNodeSnapshots;
        return snapshots is not null && snapshots.Count > 0;
    }
}
// ── Buffered tile ───────────────────────────────────────────────────────
private string BufferedDisplay =>
@@ -9,6 +9,7 @@
@using ZB.MOM.WW.ScadaBridge.HealthMonitoring
@using ZB.MOM.WW.ScadaBridge.Commons.Messages.Notification
@using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit
@using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit
@using ZB.MOM.WW.ScadaBridge.Communication
@implements IDisposable
@inject ICentralHealthAggregator HealthAggregator
@@ -65,7 +66,9 @@
(buffered / stuck / parked). Refreshed alongside the site states. *@
<SiteCallKpiTiles Snapshot="@_siteCallKpi"
IsAvailable="@_siteCallKpiAvailable"
ErrorMessage="@_siteCallKpiError" />
ErrorMessage="@_siteCallKpiError"
PerNodeSnapshots="@_siteCallNodeKpis"
PerNodeAvailable="@_siteCallNodeKpiAvailable" />
@* Audit Log (#23) M7 Bundle E — three KPI tiles for the Audit channel
(volume / error rate / backlog). Refreshed alongside the site states. *@
@@ -378,6 +381,12 @@
private bool _siteCallKpiAvailable;
private string? _siteCallKpiError;
// Per-node Site Call KPI breakdown (T6: M5.2 per-node stuck-count KPIs).
// Passed to SiteCallKpiTiles as an optional sub-table.
private IReadOnlyList<SiteCallNodeKpiSnapshot> _siteCallNodeKpis =
Array.Empty<SiteCallNodeKpiSnapshot>();
private bool _siteCallNodeKpiAvailable;
private static bool SiteHasActiveErrors(SiteHealthState state)
{
var report = state.LatestReport;
@@ -415,7 +424,7 @@
{
_siteStates = HealthAggregator.GetAllSiteStates();
await LoadOutboxKpis();
await LoadSiteCallKpis();
await Task.WhenAll(LoadSiteCallKpis(), LoadSiteCallNodeKpis());
await LoadAuditKpis();
}
@@ -474,6 +483,30 @@
}
}
// Loads the per-node site-call KPI breakdown (T6: M5.2). Best-effort: an
// unsuccessful response or a thrown exception only clears the availability
// flag, which hides the per-node sub-table instead of degrading the dashboard.
private async Task LoadSiteCallNodeKpis()
{
    try
    {
        var correlationId = Guid.NewGuid().ToString("N");
        var reply = await CommunicationService.GetPerNodeSiteCallKpisAsync(
            new PerNodeSiteCallKpiRequest(correlationId));

        if (!reply.Success)
        {
            _siteCallNodeKpiAvailable = false;
            return;
        }

        _siteCallNodeKpis = reply.Nodes;
        _siteCallNodeKpiAvailable = true;
    }
    catch
    {
        // Deliberately swallowed: the sub-table is optional, so a fault just hides it.
        _siteCallNodeKpiAvailable = false;
    }
}
// Tiles show the numeric KPI when available, or an em dash when the outbox
// KPI query failed — matching how the page renders other unavailable data.
private string OutboxTileValue(int value) =>
@@ -69,6 +69,51 @@
</div>
}
@* ── Per-node breakdown (T6: additive) ── *@
@* Three mutually exclusive states: a warning when the per-node query reported
   an error, an empty-state card when no node rows came back, otherwise the
   table with one row per node. The error check comes first, so any rows still
   held in _perNode are not shown while _perNodeError is set. *@
<h5 class="mb-2">Per-node breakdown</h5>
@if (_perNodeError != null)
{
<div class="alert alert-warning py-2">Per-node KPIs unavailable: @_perNodeError</div>
}
else if (_perNode.Count == 0)
{
<div class="card mb-3">
<div class="card-body text-center text-muted py-3">
<div class="small">No per-node activity (rows may have a null SourceNode).</div>
</div>
</div>
}
else
{
<div class="table-responsive mb-3">
<table class="table table-sm table-hover align-middle">
<thead class="table-light">
<tr>
<th>Node</th>
<th class="text-end">Queue Depth</th>
<th class="text-end">Stuck</th>
<th class="text-end">Parked</th>
<th class="text-end">Delivered (last interval)</th>
<th class="text-end">Oldest Pending Age</th>
</tr>
</thead>
<tbody>
@foreach (var n in _perNode)
{
@* A node with any stuck notifications gets the whole row highlighted
   (table-warning); the stuck and parked cells are additionally coloured
   when their own count is non-zero. *@
<tr @key="n.SourceNode" class="@(n.StuckCount > 0 ? "table-warning" : "")">
<td><code>@n.SourceNode</code></td>
<td class="text-end font-monospace">@n.QueueDepth</td>
<td class="text-end font-monospace @(n.StuckCount > 0 ? "text-warning" : "")">@n.StuckCount</td>
<td class="text-end font-monospace @(n.ParkedCount > 0 ? "text-danger" : "")">@n.ParkedCount</td>
<td class="text-end font-monospace text-success">@n.DeliveredLastInterval</td>
<td class="text-end font-monospace">@FormatAge(n.OldestPendingAge)</td>
</tr>
}
</tbody>
</table>
</div>
}
@* ── Per-site breakdown ── *@
<h5 class="mb-2">Per-site breakdown</h5>
@if (_perSiteError != null)
@@ -124,6 +169,10 @@
private IReadOnlyList<SiteNotificationKpiSnapshot> _perSite = Array.Empty<SiteNotificationKpiSnapshot>();
private string? _perSiteError;
// ── Per-node (T6: M5.2 per-node stuck-count KPIs) ──
private IReadOnlyList<NodeNotificationKpiSnapshot> _perNode = Array.Empty<NodeNotificationKpiSnapshot>();
private string? _perNodeError;
private bool _loading;
protected override async Task OnInitializedAsync()
@@ -144,9 +193,9 @@
private async Task RefreshAll()
{
_loading = true;
// Race-free despite both tasks mutating component fields: Blazor Server runs
// Race-free despite all tasks mutating component fields: Blazor Server runs
// every continuation on the circuit's single-threaded synchronization context.
await Task.WhenAll(LoadGlobalKpis(), LoadPerSiteKpis());
await Task.WhenAll(LoadGlobalKpis(), LoadPerSiteKpis(), LoadPerNodeKpis());
_loading = false;
}
@@ -194,6 +243,28 @@
}
}
// Loads the per-node notification KPI breakdown. On success the rows replace
// _perNode and any previous error is cleared; on an unsuccessful response or
// an exception only _perNodeError is set.
private async Task LoadPerNodeKpis()
{
    try
    {
        var request = new PerNodeNotificationKpiRequest(Guid.NewGuid().ToString("N"));
        var reply = await CommunicationService.GetPerNodeNotificationKpisAsync(request);

        if (!reply.Success)
        {
            _perNodeError = reply.ErrorMessage ?? "Per-node KPI query failed.";
            return;
        }

        _perNode = reply.Nodes;
        _perNodeError = null;
    }
    catch (Exception failure)
    {
        _perNodeError = $"Per-node KPI query failed: {failure.Message}";
    }
}
// Resolves a site identifier to its display name, falling back to the
// identifier itself when no matching site (or no name) is found.
private string SiteName(string siteId)
{
    var match = _sites.FirstOrDefault(candidate => candidate.SiteIdentifier == siteId);
    return match?.Name ?? siteId;
}
@@ -100,6 +100,19 @@ public interface INotificationOutboxRepository
Task<IReadOnlyList<SiteNotificationKpiSnapshot>> ComputePerSiteKpisAsync(
DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default);
/// <summary>
/// Computes a point-in-time <see cref="NodeNotificationKpiSnapshot"/> per originating node.
/// Rows with a <c>NULL</c> <c>SourceNode</c> are excluded. Nodes with no notification rows
/// at all are omitted; the repository implementation added with this method also omits a
/// node whose rows fall into none of the counted categories (queue depth, stuck, parked,
/// delivered since <paramref name="deliveredSince"/>). The stuck and delivered cutoffs are
/// supplied by the caller; the current time used for <c>OldestPendingAge</c> is captured
/// inside the method.
/// </summary>
/// <param name="stuckCutoff">The time threshold for marking notifications as stuck.</param>
/// <param name="deliveredSince">The time threshold for counting delivered notifications.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>A list of per-node KPI snapshots, ordered by node name.</returns>
Task<IReadOnlyList<NodeNotificationKpiSnapshot>> ComputePerNodeKpisAsync(
DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default);
/// <summary>
/// Persists pending changes tracked on the underlying context. Use this when staging
/// multiple changes for a single commit; the individual mutating methods on this
@@ -107,4 +107,19 @@ public interface ISiteCallAuditRepository
DateTime stuckCutoff,
DateTime intervalSince,
CancellationToken ct = default);
/// <summary>
/// Computes a point-in-time <see cref="SiteCallNodeKpiSnapshot"/> per originating
/// node. Rows with a <c>NULL</c> <c>SourceNode</c> are excluded. Nodes with no
/// <c>SiteCalls</c> rows at all are omitted; the repository implementation added
/// with this method also omits a node whose rows fall into none of the counted
/// categories (buffered, parked, stuck, failed/delivered within the interval).
/// The stuck cutoff and interval bounds are interpreted as in
/// <see cref="ComputeKpisAsync"/>.
/// </summary>
/// <param name="stuckCutoff">UTC threshold for classifying a row as stuck.</param>
/// <param name="intervalSince">UTC start of the delivered/failed interval window.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>
/// A task that resolves to a per-node KPI list; nodes with no rows are omitted.
/// The implementation in this change returns the list ordered by node name (ordinal).
/// </returns>
Task<IReadOnlyList<SiteCallNodeKpiSnapshot>> ComputePerNodeKpisAsync(
DateTime stuckCutoff,
DateTime intervalSince,
CancellationToken ct = default);
}
@@ -164,3 +164,24 @@ public sealed record PerSiteSiteCallKpiResponse(
bool Success,
string? ErrorMessage,
IReadOnlyList<SiteCallSiteKpiSnapshot> Sites);
/// <summary>
/// Site Calls UI -> Central: request for the per-node <c>SiteCalls</c>
/// KPI breakdown. Mirrors <see cref="PerSiteSiteCallKpiRequest"/> but groups
/// by <c>SourceNode</c> instead of <c>SourceSite</c>. Additive — does not
/// change per-site behaviour.
/// </summary>
/// <param name="CorrelationId">
/// Caller-chosen identifier for this request; the handler copies it onto the
/// matching <see cref="PerNodeSiteCallKpiResponse"/>.
/// </param>
public sealed record PerNodeSiteCallKpiRequest(
string CorrelationId);
/// <summary>
/// Central -> Site Calls UI: per-node KPI breakdown for the Site Calls KPIs
/// page. On a repository fault <see cref="Success"/> is <c>false</c>,
/// <see cref="ErrorMessage"/> carries the cause, and <see cref="Nodes"/> is empty.
/// Rows with a <c>NULL</c> <c>SourceNode</c> are left out of the aggregation, so
/// no entry is ever reported for an unknown node.
/// </summary>
/// <param name="CorrelationId">The <c>CorrelationId</c> of the request this answers.</param>
/// <param name="Success"><c>true</c> when the per-node query completed; <c>false</c> on a fault.</param>
/// <param name="ErrorMessage">
/// <c>null</c> on success; on a fault, the message of the failure's base exception.
/// </param>
/// <param name="Nodes">One snapshot per reported node; empty on a fault.</param>
public sealed record PerNodeSiteCallKpiResponse(
string CorrelationId,
bool Success,
string? ErrorMessage,
IReadOnlyList<SiteCallNodeKpiSnapshot> Nodes);
@@ -159,3 +159,23 @@ public record PerSiteNotificationKpiResponse(
bool Success,
string? ErrorMessage,
IReadOnlyList<SiteNotificationKpiSnapshot> Sites);
/// <summary>
/// Outbox UI -> Central: request for the per-node notification outbox KPI breakdown.
/// Mirrors <see cref="PerSiteNotificationKpiRequest"/> but groups by <c>SourceNode</c>
/// instead of <c>SourceSiteId</c>. Additive — does not change per-site behaviour.
/// </summary>
/// <param name="CorrelationId">
/// Caller-chosen identifier for this request; the handler copies it onto the
/// matching <see cref="PerNodeNotificationKpiResponse"/>.
/// </param>
public record PerNodeNotificationKpiRequest(
string CorrelationId);
/// <summary>
/// Central -> Outbox UI: per-node KPI breakdown for the Notification KPIs page.
/// On a repository fault <see cref="Success"/> is <c>false</c>, <see cref="ErrorMessage"/>
/// carries the cause, and <see cref="Nodes"/> is empty. Rows with a <c>NULL</c>
/// <c>SourceNode</c> are left out of the aggregation, so no entry is ever reported
/// for an unknown node.
/// </summary>
/// <param name="CorrelationId">The <c>CorrelationId</c> of the request this answers.</param>
/// <param name="Success"><c>true</c> when the per-node query completed; <c>false</c> on a fault.</param>
/// <param name="ErrorMessage">
/// <c>null</c> on success; on a fault, the message of the failure's base exception.
/// </param>
/// <param name="Nodes">One snapshot per reported node; empty on a fault.</param>
public record PerNodeNotificationKpiResponse(
string CorrelationId,
bool Success,
string? ErrorMessage,
IReadOnlyList<NodeNotificationKpiSnapshot> Nodes);
@@ -0,0 +1,37 @@
namespace ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
/// <summary>
/// Point-in-time <c>SiteCalls</c> metrics scoped to a single originating node. The
/// per-node counterpart of <see cref="SiteCallSiteKpiSnapshot"/>; surfaced in the
/// per-node breakdown table on the Site Calls KPIs page. Mirrors
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Notifications.NodeNotificationKpiSnapshot"/>.
/// </summary>
/// <remarks>
/// The <c>SiteCallKpiTiles</c> component renders only <c>SourceNode</c>,
/// <c>StuckCount</c> and <c>ParkedCount</c> from this record in its per-node
/// sub-table. NOTE(review): no other consumer is visible in this change —
/// confirm whether the Site Calls KPIs page also renders it. The repository
/// builds the node list from the union of its per-node counts, so a snapshot
/// is only produced for a node with at least one non-zero count.
/// </remarks>
/// <param name="SourceNode">
/// The node identifier these metrics are scoped to (e.g. <c>node-a</c>,
/// <c>node-b</c>). Rows with a <c>NULL</c> <c>SourceNode</c> are omitted.
/// </param>
/// <param name="BufferedCount">Count of this node's non-terminal rows (<c>TerminalAtUtc IS NULL</c>).</param>
/// <param name="ParkedCount">Count of this node's rows in the <c>Parked</c> status.</param>
/// <param name="FailedLastInterval">
/// Count of this node's <c>Failed</c> rows whose <c>TerminalAtUtc</c> is at or
/// after the "since" timestamp.
/// </param>
/// <param name="DeliveredLastInterval">
/// Count of this node's <c>Delivered</c> rows whose <c>TerminalAtUtc</c> is at
/// or after the "since" timestamp.
/// </param>
/// <param name="OldestPendingAge">
/// Age of this node's oldest non-terminal row (the repository's captured current
/// time minus the earliest <c>CreatedAtUtc</c>), or <c>null</c> when it has none.
/// </param>
/// <param name="StuckCount">
/// Count of this node's non-terminal rows whose <c>CreatedAtUtc</c> is older
/// than the stuck cutoff. Note that it is the last positional parameter, after
/// <paramref name="OldestPendingAge"/>.
/// </param>
public sealed record SiteCallNodeKpiSnapshot(
string SourceNode,
int BufferedCount,
int ParkedCount,
int FailedLastInterval,
int DeliveredLastInterval,
TimeSpan? OldestPendingAge,
int StuckCount);
@@ -0,0 +1,30 @@
namespace ZB.MOM.WW.ScadaBridge.Commons.Types.Notifications;
/// <summary>
/// Point-in-time notification-outbox metrics scoped to a single originating node.
/// The per-node counterpart of <see cref="SiteNotificationKpiSnapshot"/>; surfaced
/// in the per-node breakdown table on the Notification KPIs page.
/// </summary>
/// <remarks>
/// The repository builds the node list from the union of its per-node counts
/// (queue depth, stuck, parked, delivered), so a snapshot is only produced for
/// a node with at least one non-zero count.
/// </remarks>
/// <param name="SourceNode">
/// The node identifier these metrics are scoped to (e.g. <c>node-a</c>,
/// <c>node-b</c>). Rows with a <c>NULL</c> <c>SourceNode</c> are omitted.
/// </param>
/// <param name="QueueDepth">Count of this node's non-terminal rows (Pending + Retrying).</param>
/// <param name="StuckCount">
/// Count of this node's non-terminal rows whose <c>CreatedAt</c> is older than the stuck cutoff.
/// </param>
/// <param name="ParkedCount">Count of this node's rows in the Parked status.</param>
/// <param name="DeliveredLastInterval">
/// Count of this node's Delivered rows whose <c>DeliveredAt</c> is at or after the
/// "delivered since" timestamp.
/// </param>
/// <param name="OldestPendingAge">
/// Age of this node's oldest non-terminal row (the repository's captured current
/// time minus the earliest <c>CreatedAt</c>), or <c>null</c> when it has none.
/// </param>
public record NodeNotificationKpiSnapshot(
string SourceNode,
int QueueDepth,
int StuckCount,
int ParkedCount,
int DeliveredLastInterval,
TimeSpan? OldestPendingAge);
@@ -525,6 +525,22 @@ public class CommunicationService
request, _options.QueryTimeout, cancellationToken);
}
/// <summary>
/// Asks the notification outbox for its KPI metrics broken down by
/// <c>SourceNode</c> (e.g. <c>node-a</c>/<c>node-b</c>); rows with a
/// <c>NULL</c> node are omitted. Additive alongside
/// <see cref="GetPerSiteNotificationKpisAsync"/>.
/// </summary>
/// <param name="request">The per-node notification KPI request.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The per-node notification KPI response.</returns>
public async Task<PerNodeNotificationKpiResponse> GetPerNodeNotificationKpisAsync(
    PerNodeNotificationKpiRequest request, CancellationToken cancellationToken = default)
{
    var outbox = GetNotificationOutbox();
    var reply = await outbox.Ask<PerNodeNotificationKpiResponse>(
        request, _options.QueryTimeout, cancellationToken);
    return reply;
}
// ── Site Call Audit (central-local actor — Asked directly, no SiteEnvelope) ──
/// <summary>
@@ -579,6 +595,21 @@ public class CommunicationService
request, _options.QueryTimeout, cancellationToken);
}
/// <summary>
/// Asks the site call audit actor for its KPI metrics broken down by
/// <c>SourceNode</c> (e.g. <c>node-a</c>/<c>node-b</c>); rows with a
/// <c>NULL</c> node are omitted. Additive alongside
/// <see cref="GetPerSiteSiteCallKpisAsync"/>.
/// </summary>
/// <param name="request">The per-node site call KPI request.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The per-node site call KPI response.</returns>
public async Task<PerNodeSiteCallKpiResponse> GetPerNodeSiteCallKpisAsync(
    PerNodeSiteCallKpiRequest request, CancellationToken cancellationToken = default)
{
    var siteCallAudit = GetSiteCallAudit();
    var reply = await siteCallAudit.Ask<PerNodeSiteCallKpiResponse>(
        request, _options.QueryTimeout, cancellationToken);
    return reply;
}
/// <summary>
/// Task 5 (#22): relays an operator Retry of a parked cached call to its
/// owning site. The <c>SiteCallAuditActor</c> is Asked directly (it is
@@ -300,6 +300,63 @@ VALUES
: null)).ToList();
}
/// <inheritdoc />
public async Task<IReadOnlyList<NodeNotificationKpiSnapshot>> ComputePerNodeKpisAsync(
    DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default)
{
    // Reference instant for OldestPendingAge, captured before any query runs.
    var capturedAt = DateTimeOffset.UtcNow;

    // Every predicate carries its own SourceNode != null guard: rows without a
    // node stamp (legacy / unstamped) have no meaningful per-node bucket.
    var depthByNode = await CountByNodeAsync(
        n => (n.Status == NotificationStatus.Pending || n.Status == NotificationStatus.Retrying)
            && n.SourceNode != null,
        cancellationToken);

    var stuckByNode = await CountByNodeAsync(
        n => (n.Status == NotificationStatus.Pending || n.Status == NotificationStatus.Retrying)
            && n.CreatedAt < stuckCutoff
            && n.SourceNode != null,
        cancellationToken);

    var parkedByNode = await CountByNodeAsync(
        n => n.Status == NotificationStatus.Parked && n.SourceNode != null,
        cancellationToken);

    var deliveredByNode = await CountByNodeAsync(
        n => n.Status == NotificationStatus.Delivered
            && n.DeliveredAt != null && n.DeliveredAt >= deliveredSince
            && n.SourceNode != null,
        cancellationToken);

    // Earliest non-terminal CreatedAt per node. The rows are fetched first and
    // reduced in memory — the same pattern as ComputePerSiteKpisAsync, because
    // the DateTimeOffset converter makes a SQL Min awkward.
    var pendingRows = await _context.Notifications
        .Where(n => (n.Status == NotificationStatus.Pending
                     || n.Status == NotificationStatus.Retrying)
                    && n.SourceNode != null)
        .Select(n => new { n.SourceNode, n.CreatedAt })
        .ToListAsync(cancellationToken);
    var oldestByNode = pendingRows
        .GroupBy(row => row.SourceNode!)
        .ToDictionary(perNode => perNode.Key, perNode => perNode.Min(row => row.CreatedAt));

    // A node is reported when it appears in at least one count; the sorted set
    // both de-duplicates the names and yields them in ordinal order.
    var orderedNodes = new SortedSet<string>(StringComparer.Ordinal);
    orderedNodes.UnionWith(depthByNode.Keys);
    orderedNodes.UnionWith(stuckByNode.Keys);
    orderedNodes.UnionWith(parkedByNode.Keys);
    orderedNodes.UnionWith(deliveredByNode.Keys);

    var snapshots = new List<NodeNotificationKpiSnapshot>(orderedNodes.Count);
    foreach (var node in orderedNodes)
    {
        TimeSpan? oldestAge = oldestByNode.TryGetValue(node, out var createdAt)
            ? capturedAt - createdAt
            : null;

        snapshots.Add(new NodeNotificationKpiSnapshot(
            SourceNode: node,
            QueueDepth: depthByNode.GetValueOrDefault(node),
            StuckCount: stuckByNode.GetValueOrDefault(node),
            ParkedCount: parkedByNode.GetValueOrDefault(node),
            DeliveredLastInterval: deliveredByNode.GetValueOrDefault(node),
            OldestPendingAge: oldestAge));
    }

    return snapshots;
}
/// <summary>Counts notification rows matching <paramref name="predicate"/>, grouped by source site.</summary>
private async Task<Dictionary<string, int>> CountBySiteAsync(
System.Linq.Expressions.Expression<Func<Notification, bool>> predicate,
@@ -312,6 +369,22 @@ VALUES
.ToDictionaryAsync(x => x.Site, x => x.Count, cancellationToken);
}
/// <summary>
/// Returns, for each source node, how many notification rows satisfy
/// <paramref name="predicate"/>. The grouping key assumes a non-null
/// <c>SourceNode</c>; it is the predicate's job to filter out rows without one.
/// </summary>
private async Task<Dictionary<string, int>> CountByNodeAsync(
    System.Linq.Expressions.Expression<Func<Notification, bool>> predicate,
    CancellationToken cancellationToken)
{
    var countsPerNode =
        from notification in _context.Notifications.Where(predicate)
        group notification by notification.SourceNode! into nodeGroup
        select new { Node = nodeGroup.Key, Count = nodeGroup.Count() };

    return await countsPerNode.ToDictionaryAsync(
        row => row.Node, row => row.Count, cancellationToken);
}
/// <inheritdoc />
public async Task<int> SaveChangesAsync(CancellationToken cancellationToken = default)
{
    // Straight pass-through to the underlying context.
    var result = await _context.SaveChangesAsync(cancellationToken);
    return result;
}
@@ -324,6 +324,61 @@ ORDER BY CreatedAtUtc DESC, TrackedOperationId DESC;";
StuckCount: stuck.GetValueOrDefault(site))).ToList();
}
/// <inheritdoc />
public async Task<IReadOnlyList<SiteCallNodeKpiSnapshot>> ComputePerNodeKpisAsync(
    DateTime stuckCutoff, DateTime intervalSince, CancellationToken ct = default)
{
    // Reference instant for OldestPendingAge, captured before any query runs.
    var utcNow = DateTime.UtcNow;

    // Per-node KPIs need a known node identity, so every predicate filters out
    // rows with a NULL SourceNode; that keeps the GROUP BY key non-null.
    var bufferedByNode = await CountByNodeAsync(
        s => s.TerminalAtUtc == null && s.SourceNode != null, ct);

    var parkedByNode = await CountByNodeAsync(
        s => s.Status == StatusParked && s.SourceNode != null, ct);

    var failedByNode = await CountByNodeAsync(
        s => s.Status == StatusFailed
            && s.TerminalAtUtc != null && s.TerminalAtUtc >= intervalSince
            && s.SourceNode != null, ct);

    var deliveredByNode = await CountByNodeAsync(
        s => s.Status == StatusDelivered
            && s.TerminalAtUtc != null && s.TerminalAtUtc >= intervalSince
            && s.SourceNode != null, ct);

    var stuckByNode = await CountByNodeAsync(
        s => s.TerminalAtUtc == null && s.CreatedAtUtc < stuckCutoff
            && s.SourceNode != null, ct);

    // Earliest non-terminal CreatedAtUtc per node, reduced by the database
    // (GROUP BY + MIN) and then indexed by node name.
    var oldestRows = await _context.SiteCalls
        .Where(s => s.TerminalAtUtc == null && s.SourceNode != null)
        .GroupBy(s => s.SourceNode!)
        .Select(g => new { Node = g.Key, Oldest = g.Min(s => s.CreatedAtUtc) })
        .ToListAsync(ct);
    var oldestByNode = oldestRows.ToDictionary(row => row.Node, row => row.Oldest);

    // A node is reported when it appears in at least one count; the sorted set
    // both de-duplicates the names and yields them in ordinal order.
    var orderedNodes = new SortedSet<string>(StringComparer.Ordinal);
    orderedNodes.UnionWith(bufferedByNode.Keys);
    orderedNodes.UnionWith(parkedByNode.Keys);
    orderedNodes.UnionWith(failedByNode.Keys);
    orderedNodes.UnionWith(deliveredByNode.Keys);
    orderedNodes.UnionWith(stuckByNode.Keys);

    var snapshots = new List<SiteCallNodeKpiSnapshot>(orderedNodes.Count);
    foreach (var node in orderedNodes)
    {
        TimeSpan? oldestAge = oldestByNode.TryGetValue(node, out var createdAt)
            ? utcNow - createdAt
            : null;

        snapshots.Add(new SiteCallNodeKpiSnapshot(
            SourceNode: node,
            BufferedCount: bufferedByNode.GetValueOrDefault(node),
            ParkedCount: parkedByNode.GetValueOrDefault(node),
            FailedLastInterval: failedByNode.GetValueOrDefault(node),
            DeliveredLastInterval: deliveredByNode.GetValueOrDefault(node),
            OldestPendingAge: oldestAge,
            StuckCount: stuckByNode.GetValueOrDefault(node)));
    }

    return snapshots;
}
/// <summary>Counts <c>SiteCalls</c> rows matching <paramref name="predicate"/>, grouped by source site.</summary>
private async Task<Dictionary<string, int>> CountBySiteAsync(
System.Linq.Expressions.Expression<Func<SiteCall, bool>> predicate,
@@ -336,6 +391,22 @@ ORDER BY CreatedAtUtc DESC, TrackedOperationId DESC;";
.ToDictionaryAsync(x => x.Site, x => x.Count, ct);
}
/// <summary>
/// Returns, for each source node, how many <c>SiteCalls</c> rows satisfy
/// <paramref name="predicate"/>. The grouping key assumes a non-null
/// <c>SourceNode</c>; it is the predicate's job to filter out rows without one.
/// </summary>
private async Task<Dictionary<string, int>> CountByNodeAsync(
    System.Linq.Expressions.Expression<Func<SiteCall, bool>> predicate,
    CancellationToken ct)
{
    var countsPerNode =
        from siteCall in _context.SiteCalls.Where(predicate)
        group siteCall by siteCall.SourceNode! into nodeGroup
        select new { Node = nodeGroup.Key, Count = nodeGroup.Count() };

    return await countsPerNode.ToDictionaryAsync(
        row => row.Node, row => row.Count, ct);
}
private static int GetRankOrThrow(string status)
{
if (!StatusRank.TryGetValue(status, out var rank))
@@ -122,6 +122,7 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
Receive<DiscardNotificationRequest>(HandleDiscard);
Receive<NotificationKpiRequest>(HandleKpiRequest);
Receive<PerSiteNotificationKpiRequest>(HandlePerSiteKpiRequest);
Receive<PerNodeNotificationKpiRequest>(HandlePerNodeKpiRequest);
}
/// <inheritdoc />
@@ -1081,6 +1082,38 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
return new PerSiteNotificationKpiResponse(correlationId, Success: true, ErrorMessage: null, sites);
}
/// <summary>
/// Answers a per-node KPI request with the per-source-node outbox metrics, applying the
/// same stuck cutoff and delivered window as <see cref="HandleKpiRequest"/>. Additive
/// alongside <see cref="HandlePerSiteKpiRequest"/> — does not change per-site behaviour.
/// A fault is piped back as an unsuccessful response carrying the base exception's
/// message and an empty node list.
/// </summary>
private void HandlePerNodeKpiRequest(PerNodeNotificationKpiRequest request)
{
    // Captured before the asynchronous work starts; the reply is piped to it later.
    var replyTo = Sender;
    var utcNow = DateTimeOffset.UtcNow;

    var kpiTask = ComputePerNodeKpisAsync(
        request.CorrelationId,
        StuckCutoff(utcNow),
        utcNow - _options.DeliveredKpiWindow);

    kpiTask.PipeTo(
        replyTo,
        success: reply => reply,
        failure: fault => new PerNodeNotificationKpiResponse(
            request.CorrelationId,
            Success: false,
            ErrorMessage: fault.GetBaseException().Message,
            Nodes: Array.Empty<NodeNotificationKpiSnapshot>()));
}
// Resolves the outbox repository from a fresh DI scope, runs the per-node KPI
// query and wraps the rows in a successful response. Exceptions propagate to
// the caller; the scope is disposed either way.
private async Task<PerNodeNotificationKpiResponse> ComputePerNodeKpisAsync(
    string correlationId, DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince)
{
    using (var scope = _serviceProvider.CreateScope())
    {
        var outboxRepository = scope.ServiceProvider.GetRequiredService<INotificationOutboxRepository>();
        var perNode = await outboxRepository.ComputePerNodeKpisAsync(stuckCutoff, deliveredSince);

        return new PerNodeNotificationKpiResponse(
            correlationId, Success: true, ErrorMessage: null, Nodes: perNode);
    }
}
/// <summary>
/// The instant before which a still-pending notification counts as stuck — <paramref name="now"/>
/// offset back by <see cref="NotificationOutboxOptions.StuckAgeThreshold"/>.
@@ -239,6 +239,7 @@ public class SiteCallAuditActor : ReceiveActor
Receive<SiteCallDetailRequest>(HandleDetail);
Receive<SiteCallKpiRequest>(HandleKpi);
Receive<PerSiteSiteCallKpiRequest>(HandlePerSiteKpi);
Receive<PerNodeSiteCallKpiRequest>(HandlePerNodeKpi);
// Task 5 (#22): central→site Retry/Discard relay for parked cached calls.
Receive<RegisterCentralCommunication>(msg =>
@@ -817,6 +818,47 @@ public class SiteCallAuditActor : ReceiveActor
}
}
/// <summary>
/// Answers a per-node KPI request, applying the same stuck cutoff and
/// interval bound as <see cref="HandleKpi"/>. Additive alongside
/// <see cref="HandlePerSiteKpi"/> — does not change per-site behaviour.
/// A fault is piped back as an unsuccessful response carrying the base
/// exception's message and an empty node list.
/// </summary>
private void HandlePerNodeKpi(PerNodeSiteCallKpiRequest request)
{
    // Captured before the asynchronous work starts; the reply is piped to it later.
    var replyTo = Sender;
    var utcNow = DateTime.UtcNow;

    var kpiTask = PerNodeKpiAsync(
        request.CorrelationId,
        utcNow - _options.StuckAgeThreshold,
        utcNow - _options.KpiInterval);

    kpiTask.PipeTo(
        replyTo,
        success: reply => reply,
        failure: fault => new PerNodeSiteCallKpiResponse(
            request.CorrelationId,
            Success: false,
            ErrorMessage: fault.GetBaseException().Message,
            Nodes: Array.Empty<SiteCallNodeKpiSnapshot>()));
}
// Runs the per-node KPI query against the resolved repository and wraps the
// rows in a successful response. Exceptions propagate to the caller; the scope
// returned by ResolveRepository (when there is one) is disposed either way.
private async Task<PerNodeSiteCallKpiResponse> PerNodeKpiAsync(
    string correlationId, DateTime stuckCutoff, DateTime intervalSince)
{
    var (ownedScope, auditRepository) = ResolveRepository();
    try
    {
        var query = auditRepository.ComputePerNodeKpisAsync(stuckCutoff, intervalSince);
        var perNode = await query.ConfigureAwait(false);

        return new PerNodeSiteCallKpiResponse(
            correlationId, Success: true, ErrorMessage: null, Nodes: perNode);
    }
    finally
    {
        ownedScope?.Dispose();
    }
}
// ── Task 5: central→site Retry/Discard relay ──
/// <summary>