feat(audit): M5.2 per-node stuck-count KPIs (T6) — repo per-node aggregation, actor message pair, CentralUI tiles
This commit is contained in:
@@ -58,3 +58,31 @@
|
||||
{
|
||||
<div class="text-muted small mb-3">Site Call KPIs unavailable: @ErrorMessage</div>
|
||||
}
|
||||
@* ── Per-node stuck/parked sub-table (T6: M5.2 per-node stuck-count KPIs) ── *@
|
||||
@if (HasNodeBreakdown)
|
||||
{
|
||||
<div class="mb-3">
|
||||
<div class="d-flex justify-content-between align-items-center mb-1">
|
||||
<small class="text-muted">By node</small>
|
||||
</div>
|
||||
<table class="table table-sm table-borderless mb-0 site-call-kpi-node-table">
|
||||
<thead class="table-light">
|
||||
<tr>
|
||||
<th class="small py-1">Node</th>
|
||||
<th class="text-end small py-1">Stuck</th>
|
||||
<th class="text-end small py-1">Parked</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
@foreach (var n in PerNodeSnapshots!)
|
||||
{
|
||||
<tr @key="n.SourceNode">
|
||||
<td class="small py-1"><code>@n.SourceNode</code></td>
|
||||
<td class="text-end font-monospace small py-1 @(n.StuckCount > 0 ? "text-warning" : "")">@n.StuckCount</td>
|
||||
<td class="text-end font-monospace small py-1 @(n.ParkedCount > 0 ? "text-danger" : "")">@n.ParkedCount</td>
|
||||
</tr>
|
||||
}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
using Microsoft.AspNetCore.Components;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.CentralUI.Components.Health;
|
||||
|
||||
@@ -59,6 +60,24 @@ public partial class SiteCallKpiTiles
|
||||
/// </summary>
|
||||
[Parameter] public string? ErrorMessage { get; set; }
|
||||
|
||||
/// <summary>
/// Per-node KPI rows for the optional sub-table (T6: M5.2 per-node stuck-count KPIs).
/// The compact node-level stuck/parked sub-table is rendered below the main tiles only
/// when <see cref="PerNodeAvailable"/> is <c>true</c> and this list is non-null and
/// non-empty. A <c>null</c> value (parent has not loaded it, or has opted out)
/// suppresses the sub-table entirely.
/// </summary>
[Parameter] public IReadOnlyList<SiteCallNodeKpiSnapshot>? PerNodeSnapshots { get; set; }

/// <summary>
/// Set by the parent to <c>true</c> when <see cref="PerNodeSnapshots"/> holds the result
/// of a successful query. While <c>false</c> the sub-table is hidden, which is how a
/// failed load is kept off the screen.
/// </summary>
[Parameter] public bool PerNodeAvailable { get; set; }

/// <summary>
/// Whether the per-node sub-table has data to render: the breakdown must be flagged as
/// available and contain at least one row.
/// </summary>
internal bool HasNodeBreakdown
{
    get
    {
        if (!PerNodeAvailable)
        {
            return false;
        }

        return PerNodeSnapshots?.Count > 0;
    }
}
|
||||
|
||||
// ── Buffered tile ───────────────────────────────────────────────────────
|
||||
|
||||
private string BufferedDisplay =>
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
@using ZB.MOM.WW.ScadaBridge.HealthMonitoring
|
||||
@using ZB.MOM.WW.ScadaBridge.Commons.Messages.Notification
|
||||
@using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit
|
||||
@using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit
|
||||
@using ZB.MOM.WW.ScadaBridge.Communication
|
||||
@implements IDisposable
|
||||
@inject ICentralHealthAggregator HealthAggregator
|
||||
@@ -65,7 +66,9 @@
|
||||
(buffered / stuck / parked). Refreshed alongside the site states. *@
|
||||
<SiteCallKpiTiles Snapshot="@_siteCallKpi"
|
||||
IsAvailable="@_siteCallKpiAvailable"
|
||||
ErrorMessage="@_siteCallKpiError" />
|
||||
ErrorMessage="@_siteCallKpiError"
|
||||
PerNodeSnapshots="@_siteCallNodeKpis"
|
||||
PerNodeAvailable="@_siteCallNodeKpiAvailable" />
|
||||
|
||||
@* Audit Log (#23) M7 Bundle E — three KPI tiles for the Audit channel
|
||||
(volume / error rate / backlog). Refreshed alongside the site states. *@
|
||||
@@ -378,6 +381,12 @@
|
||||
private bool _siteCallKpiAvailable;
|
||||
private string? _siteCallKpiError;
|
||||
|
||||
// Per-node Site Call KPI breakdown (T6: M5.2 per-node stuck-count KPIs).
|
||||
// Passed to SiteCallKpiTiles as an optional sub-table.
|
||||
private IReadOnlyList<SiteCallNodeKpiSnapshot> _siteCallNodeKpis =
|
||||
Array.Empty<SiteCallNodeKpiSnapshot>();
|
||||
private bool _siteCallNodeKpiAvailable;
|
||||
|
||||
private static bool SiteHasActiveErrors(SiteHealthState state)
|
||||
{
|
||||
var report = state.LatestReport;
|
||||
@@ -415,7 +424,7 @@
|
||||
{
|
||||
_siteStates = HealthAggregator.GetAllSiteStates();
|
||||
await LoadOutboxKpis();
|
||||
await LoadSiteCallKpis();
|
||||
await Task.WhenAll(LoadSiteCallKpis(), LoadSiteCallNodeKpis());
|
||||
await LoadAuditKpis();
|
||||
}
|
||||
|
||||
@@ -474,6 +483,30 @@
|
||||
}
|
||||
}
|
||||
|
||||
// Loads the per-node site-call KPI breakdown (T6: M5.2). This is best-effort by
// design: an unsuccessful response or any thrown exception only clears the
// availability flag, so the per-node sub-table is hidden instead of the dashboard
// being degraded. The previously loaded rows are left in place but are not shown
// while the flag is false.
private async Task LoadSiteCallNodeKpis()
{
    try
    {
        var request = new PerNodeSiteCallKpiRequest(Guid.NewGuid().ToString("N"));
        var reply = await CommunicationService.GetPerNodeSiteCallKpisAsync(request);

        if (!reply.Success)
        {
            _siteCallNodeKpiAvailable = false;
            return;
        }

        _siteCallNodeKpis = reply.Nodes;
        _siteCallNodeKpiAvailable = true;
    }
    catch
    {
        // Deliberately swallowed: see the method comment.
        _siteCallNodeKpiAvailable = false;
    }
}
|
||||
|
||||
// Tiles show the numeric KPI when available, or an em dash when the outbox
|
||||
// KPI query failed — matching how the page renders other unavailable data.
|
||||
private string OutboxTileValue(int value) =>
|
||||
|
||||
+73
-2
@@ -69,6 +69,51 @@
|
||||
</div>
|
||||
}
|
||||
|
||||
@* ── Per-node breakdown (T6: additive) ── *@
|
||||
<h5 class="mb-2">Per-node breakdown</h5>
|
||||
@if (_perNodeError != null)
|
||||
{
|
||||
<div class="alert alert-warning py-2">Per-node KPIs unavailable: @_perNodeError</div>
|
||||
}
|
||||
else if (_perNode.Count == 0)
|
||||
{
|
||||
<div class="card mb-3">
|
||||
<div class="card-body text-center text-muted py-3">
|
||||
<div class="small">No per-node activity (rows may have a null SourceNode).</div>
|
||||
</div>
|
||||
</div>
|
||||
}
|
||||
else
|
||||
{
|
||||
<div class="table-responsive mb-3">
|
||||
<table class="table table-sm table-hover align-middle">
|
||||
<thead class="table-light">
|
||||
<tr>
|
||||
<th>Node</th>
|
||||
<th class="text-end">Queue Depth</th>
|
||||
<th class="text-end">Stuck</th>
|
||||
<th class="text-end">Parked</th>
|
||||
<th class="text-end">Delivered (last interval)</th>
|
||||
<th class="text-end">Oldest Pending Age</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
@foreach (var n in _perNode)
|
||||
{
|
||||
<tr @key="n.SourceNode" class="@(n.StuckCount > 0 ? "table-warning" : "")">
|
||||
<td><code>@n.SourceNode</code></td>
|
||||
<td class="text-end font-monospace">@n.QueueDepth</td>
|
||||
<td class="text-end font-monospace @(n.StuckCount > 0 ? "text-warning" : "")">@n.StuckCount</td>
|
||||
<td class="text-end font-monospace @(n.ParkedCount > 0 ? "text-danger" : "")">@n.ParkedCount</td>
|
||||
<td class="text-end font-monospace text-success">@n.DeliveredLastInterval</td>
|
||||
<td class="text-end font-monospace">@FormatAge(n.OldestPendingAge)</td>
|
||||
</tr>
|
||||
}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
}
|
||||
|
||||
@* ── Per-site breakdown ── *@
|
||||
<h5 class="mb-2">Per-site breakdown</h5>
|
||||
@if (_perSiteError != null)
|
||||
@@ -124,6 +169,10 @@
|
||||
private IReadOnlyList<SiteNotificationKpiSnapshot> _perSite = Array.Empty<SiteNotificationKpiSnapshot>();
|
||||
private string? _perSiteError;
|
||||
|
||||
// ── Per-node (T6: M5.2 per-node stuck-count KPIs) ──
|
||||
private IReadOnlyList<NodeNotificationKpiSnapshot> _perNode = Array.Empty<NodeNotificationKpiSnapshot>();
|
||||
private string? _perNodeError;
|
||||
|
||||
private bool _loading;
|
||||
|
||||
protected override async Task OnInitializedAsync()
|
||||
@@ -144,9 +193,9 @@
|
||||
private async Task RefreshAll()
|
||||
{
|
||||
_loading = true;
|
||||
// Race-free despite both tasks mutating component fields: Blazor Server runs
|
||||
// Race-free despite all tasks mutating component fields: Blazor Server runs
|
||||
// every continuation on the circuit's single-threaded synchronization context.
|
||||
await Task.WhenAll(LoadGlobalKpis(), LoadPerSiteKpis());
|
||||
await Task.WhenAll(LoadGlobalKpis(), LoadPerSiteKpis(), LoadPerNodeKpis());
|
||||
_loading = false;
|
||||
}
|
||||
|
||||
@@ -194,6 +243,28 @@
|
||||
}
|
||||
}
|
||||
|
||||
// Loads the per-node notification KPI breakdown. On success the rows replace
// _perNode and any earlier error is cleared; on an unsuccessful response or an
// exception only _perNodeError is set, which the markup shows in place of the table.
private async Task LoadPerNodeKpis()
{
    try
    {
        var request = new PerNodeNotificationKpiRequest(Guid.NewGuid().ToString("N"));
        var reply = await CommunicationService.GetPerNodeNotificationKpisAsync(request);

        if (!reply.Success)
        {
            _perNodeError = reply.ErrorMessage ?? "Per-node KPI query failed.";
            return;
        }

        _perNode = reply.Nodes;
        _perNodeError = null;
    }
    catch (Exception ex)
    {
        _perNodeError = $"Per-node KPI query failed: {ex.Message}";
    }
}
|
||||
|
||||
private string SiteName(string siteId) =>
|
||||
_sites.FirstOrDefault(s => s.SiteIdentifier == siteId)?.Name ?? siteId;
|
||||
|
||||
|
||||
+13
@@ -100,6 +100,19 @@ public interface INotificationOutboxRepository
|
||||
Task<IReadOnlyList<SiteNotificationKpiSnapshot>> ComputePerSiteKpisAsync(
|
||||
DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default);
|
||||
|
||||
/// <summary>
|
||||
/// Computes a point-in-time <see cref="NodeNotificationKpiSnapshot"/> per originating node.
|
||||
/// Nodes with no notification rows at all are omitted; rows with a <c>NULL</c>
|
||||
/// <c>SourceNode</c> are excluded. The stuck and delivered cutoffs are supplied by the
|
||||
/// caller; the current time used for <c>OldestPendingAge</c> is captured inside the method.
|
||||
/// </summary>
/// <remarks>
/// NOTE(review): the EF-backed implementation builds its node list from the union of the
/// queue-depth, stuck, parked and delivered counts, so a node whose rows all fall outside
/// those four categories is omitted as well — confirm that this is the intended contract.
/// </remarks>
|
||||
/// <param name="stuckCutoff">The time threshold for marking notifications as stuck; a non-terminal row created before it counts as stuck.</param>
|
||||
/// <param name="deliveredSince">The time threshold for counting delivered notifications; a delivered row at or after it is counted.</param>
|
||||
/// <param name="cancellationToken">Cancellation token.</param>
|
||||
/// <returns>A list of per-node KPI snapshots, ordered by node name; empty when no node qualifies.</returns>
|
||||
Task<IReadOnlyList<NodeNotificationKpiSnapshot>> ComputePerNodeKpisAsync(
|
||||
DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default);
|
||||
|
||||
/// <summary>
|
||||
/// Persists pending changes tracked on the underlying context. Use this when staging
|
||||
/// multiple changes for a single commit; the individual mutating methods on this
|
||||
|
||||
@@ -107,4 +107,19 @@ public interface ISiteCallAuditRepository
|
||||
DateTime stuckCutoff,
|
||||
DateTime intervalSince,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Computes a point-in-time <see cref="SiteCallNodeKpiSnapshot"/> per originating
|
||||
/// node. Nodes with no <c>SiteCalls</c> rows at all are omitted; rows with a
|
||||
/// <c>NULL</c> <c>SourceNode</c> are excluded. The stuck cutoff and interval
|
||||
/// bounds are interpreted as in <see cref="ComputeKpisAsync"/>.
|
||||
/// </summary>
/// <remarks>
/// NOTE(review): the EF-backed implementation builds its node list from the union of the
/// buffered, parked, failed, delivered and stuck counts, so a node whose rows all fall
/// outside those categories (for example only terminal rows older than
/// <paramref name="intervalSince"/>) is omitted as well — confirm that this is intended.
/// </remarks>
|
||||
/// <param name="stuckCutoff">UTC threshold for classifying a row as stuck; a non-terminal row created before it counts as stuck.</param>
|
||||
/// <param name="intervalSince">UTC start of the delivered/failed interval window.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>A task that resolves to a per-node KPI list; nodes with no rows are omitted.</returns>
|
||||
Task<IReadOnlyList<SiteCallNodeKpiSnapshot>> ComputePerNodeKpisAsync(
|
||||
DateTime stuckCutoff,
|
||||
DateTime intervalSince,
|
||||
CancellationToken ct = default);
|
||||
}
|
||||
|
||||
@@ -164,3 +164,24 @@ public sealed record PerSiteSiteCallKpiResponse(
|
||||
bool Success,
|
||||
string? ErrorMessage,
|
||||
IReadOnlyList<SiteCallSiteKpiSnapshot> Sites);
|
||||
|
||||
/// <summary>
/// Site Calls UI -> Central: asks for the <c>SiteCalls</c> KPI breakdown grouped by
/// <c>SourceNode</c>. The per-node sibling of <see cref="PerSiteSiteCallKpiRequest"/>,
/// which groups by <c>SourceSite</c>; additive, so per-site behaviour is unchanged.
/// </summary>
/// <param name="CorrelationId">Caller-chosen identifier that the response carries back.</param>
public sealed record PerNodeSiteCallKpiRequest(string CorrelationId);
|
||||
|
||||
/// <summary>
/// Central -> Site Calls UI: the per-node KPI breakdown for the Site Calls KPIs page.
/// When the repository faults, <see cref="Success"/> is <c>false</c>,
/// <see cref="ErrorMessage"/> carries the cause and <see cref="Nodes"/> is empty.
/// Rows whose <c>SourceNode</c> is <c>NULL</c> do not contribute to any entry.
/// </summary>
/// <param name="CorrelationId">The correlation id copied from the request.</param>
/// <param name="Success">Whether the KPI computation completed.</param>
/// <param name="ErrorMessage">Failure cause, or <c>null</c> on success.</param>
/// <param name="Nodes">One snapshot per node; empty on failure.</param>
public sealed record PerNodeSiteCallKpiResponse(
    string CorrelationId, bool Success, string? ErrorMessage,
    IReadOnlyList<SiteCallNodeKpiSnapshot> Nodes);
|
||||
|
||||
@@ -159,3 +159,23 @@ public record PerSiteNotificationKpiResponse(
|
||||
bool Success,
|
||||
string? ErrorMessage,
|
||||
IReadOnlyList<SiteNotificationKpiSnapshot> Sites);
|
||||
|
||||
/// <summary>
/// Outbox UI -> Central: asks for the notification outbox KPI breakdown grouped by
/// <c>SourceNode</c>. The per-node sibling of <see cref="PerSiteNotificationKpiRequest"/>,
/// which groups by <c>SourceSiteId</c>; additive, so per-site behaviour is unchanged.
/// </summary>
/// <param name="CorrelationId">Caller-chosen identifier that the response carries back.</param>
public record PerNodeNotificationKpiRequest(string CorrelationId);
|
||||
|
||||
/// <summary>
/// Central -> Outbox UI: the per-node KPI breakdown for the Notification KPIs page.
/// When the repository faults, <see cref="Success"/> is <c>false</c>,
/// <see cref="ErrorMessage"/> carries the cause and <see cref="Nodes"/> is empty.
/// Rows whose <c>SourceNode</c> is <c>NULL</c> do not contribute to any entry.
/// </summary>
/// <param name="CorrelationId">The correlation id copied from the request.</param>
/// <param name="Success">Whether the KPI computation completed.</param>
/// <param name="ErrorMessage">Failure cause, or <c>null</c> on success.</param>
/// <param name="Nodes">One snapshot per node; empty on failure.</param>
public record PerNodeNotificationKpiResponse(
    string CorrelationId, bool Success, string? ErrorMessage,
    IReadOnlyList<NodeNotificationKpiSnapshot> Nodes);
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
|
||||
|
||||
/// <summary>
/// <c>SiteCalls</c> metrics for one originating node at a single point in time. This is
/// the per-node counterpart of <see cref="SiteCallSiteKpiSnapshot"/> and feeds the
/// per-node breakdown table on the Site Calls KPIs page. It mirrors
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Notifications.NodeNotificationKpiSnapshot"/>.
/// </summary>
/// <param name="SourceNode">
/// Identifier of the node the figures belong to (e.g. <c>node-a</c>, <c>node-b</c>).
/// Rows with a <c>NULL</c> <c>SourceNode</c> are never represented.
/// </param>
/// <param name="BufferedCount">Number of the node's non-terminal rows (<c>TerminalAtUtc IS NULL</c>).</param>
/// <param name="ParkedCount">Number of the node's rows whose status is <c>Parked</c>.</param>
/// <param name="FailedLastInterval">
/// Number of the node's <c>Failed</c> rows with <c>TerminalAtUtc</c> at or after the
/// "since" timestamp.
/// </param>
/// <param name="DeliveredLastInterval">
/// Number of the node's <c>Delivered</c> rows with <c>TerminalAtUtc</c> at or after the
/// "since" timestamp.
/// </param>
/// <param name="OldestPendingAge">
/// Age of the node's oldest non-terminal row; <c>null</c> when the node has none.
/// </param>
/// <param name="StuckCount">
/// Number of the node's non-terminal rows whose <c>CreatedAtUtc</c> precedes the stuck
/// cutoff.
/// </param>
public sealed record SiteCallNodeKpiSnapshot(
    string SourceNode,
    int BufferedCount, int ParkedCount,
    int FailedLastInterval, int DeliveredLastInterval,
    TimeSpan? OldestPendingAge,
    int StuckCount);
|
||||
@@ -0,0 +1,30 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.Commons.Types.Notifications;
|
||||
|
||||
/// <summary>
/// Notification-outbox metrics for one originating node at a single point in time. This
/// is the per-node counterpart of <see cref="SiteNotificationKpiSnapshot"/> and feeds the
/// per-node breakdown table on the Notification KPIs page.
/// </summary>
/// <param name="SourceNode">
/// Identifier of the node the figures belong to (e.g. <c>node-a</c>, <c>node-b</c>).
/// Rows with a <c>NULL</c> <c>SourceNode</c> are never represented.
/// </param>
/// <param name="QueueDepth">Number of the node's non-terminal rows (Pending + Retrying).</param>
/// <param name="StuckCount">
/// Number of the node's non-terminal rows whose <c>CreatedAt</c> precedes the stuck cutoff.
/// </param>
/// <param name="ParkedCount">Number of the node's rows whose status is Parked.</param>
/// <param name="DeliveredLastInterval">
/// Number of the node's Delivered rows with <c>DeliveredAt</c> at or after the
/// "delivered since" timestamp.
/// </param>
/// <param name="OldestPendingAge">
/// Age of the node's oldest non-terminal row; <c>null</c> when the node has none.
/// </param>
public record NodeNotificationKpiSnapshot(
    string SourceNode,
    int QueueDepth, int StuckCount, int ParkedCount,
    int DeliveredLastInterval,
    TimeSpan? OldestPendingAge);
|
||||
@@ -525,6 +525,22 @@ public class CommunicationService
|
||||
request, _options.QueryTimeout, cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
/// Asks the notification outbox actor for outbox KPI metrics grouped by
/// <c>SourceNode</c> (e.g. <c>node-a</c>/<c>node-b</c>); rows with a <c>NULL</c> node
/// are omitted. Additive alongside <see cref="GetPerSiteNotificationKpisAsync"/>.
/// </summary>
/// <param name="request">The per-node notification KPI request.</param>
/// <param name="cancellationToken">Cancellation token passed to the Ask.</param>
/// <returns>The per-node notification KPI response.</returns>
public async Task<PerNodeNotificationKpiResponse> GetPerNodeNotificationKpisAsync(
    PerNodeNotificationKpiRequest request, CancellationToken cancellationToken = default)
{
    var outbox = GetNotificationOutbox();

    // The Ask is bounded by the configured query timeout.
    var reply = await outbox.Ask<PerNodeNotificationKpiResponse>(
        request, _options.QueryTimeout, cancellationToken);

    return reply;
}
|
||||
|
||||
// ── Site Call Audit (central-local actor — Asked directly, no SiteEnvelope) ──
|
||||
|
||||
/// <summary>
|
||||
@@ -579,6 +595,21 @@ public class CommunicationService
|
||||
request, _options.QueryTimeout, cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
/// Asks the site call audit actor for site-call KPI metrics grouped by
/// <c>SourceNode</c> (e.g. <c>node-a</c>/<c>node-b</c>); rows with a <c>NULL</c> node
/// are omitted. Additive alongside <see cref="GetPerSiteSiteCallKpisAsync"/>.
/// </summary>
/// <param name="request">The per-node site call KPI request.</param>
/// <param name="cancellationToken">Cancellation token passed to the Ask.</param>
/// <returns>The per-node site call KPI response.</returns>
public async Task<PerNodeSiteCallKpiResponse> GetPerNodeSiteCallKpisAsync(
    PerNodeSiteCallKpiRequest request, CancellationToken cancellationToken = default)
{
    var auditActor = GetSiteCallAudit();

    // The Ask is bounded by the configured query timeout.
    var reply = await auditActor.Ask<PerNodeSiteCallKpiResponse>(
        request, _options.QueryTimeout, cancellationToken);

    return reply;
}
|
||||
|
||||
/// <summary>
|
||||
/// Task 5 (#22): relays an operator Retry of a parked cached call to its
|
||||
/// owning site. The <c>SiteCallAuditActor</c> is Asked directly (it is
|
||||
|
||||
+73
@@ -300,6 +300,63 @@ VALUES
|
||||
: null)).ToList();
|
||||
}
|
||||
|
||||
/// <inheritdoc />
public async Task<IReadOnlyList<NodeNotificationKpiSnapshot>> ComputePerNodeKpisAsync(
    DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default)
{
    // Reference instant for OldestPendingAge, taken once before the first query.
    var capturedAt = DateTimeOffset.UtcNow;

    // Every predicate requires a non-null SourceNode: legacy / unstamped rows carry no
    // node identity, so they are left out of all per-node figures.
    var depthByNode = await CountByNodeAsync(
        n => (n.Status == NotificationStatus.Pending || n.Status == NotificationStatus.Retrying)
            && n.SourceNode != null,
        cancellationToken);

    var stuckByNode = await CountByNodeAsync(
        n => (n.Status == NotificationStatus.Pending || n.Status == NotificationStatus.Retrying)
            && n.CreatedAt < stuckCutoff
            && n.SourceNode != null,
        cancellationToken);

    var parkedByNode = await CountByNodeAsync(
        n => n.Status == NotificationStatus.Parked && n.SourceNode != null,
        cancellationToken);

    var deliveredByNode = await CountByNodeAsync(
        n => n.Status == NotificationStatus.Delivered
            && n.DeliveredAt != null && n.DeliveredAt >= deliveredSince
            && n.SourceNode != null,
        cancellationToken);

    // The non-terminal rows are fetched as (node, CreatedAt) pairs and reduced to the
    // earliest CreatedAt per node in memory — the same pattern as
    // ComputePerSiteKpisAsync, because the DateTimeOffset converter makes a SQL Min awkward.
    var pendingRows = await _context.Notifications
        .Where(n => (n.Status == NotificationStatus.Pending
                     || n.Status == NotificationStatus.Retrying)
                    && n.SourceNode != null)
        .Select(n => new { n.SourceNode, n.CreatedAt })
        .ToListAsync(cancellationToken);

    var earliestByNode = pendingRows
        .GroupBy(row => row.SourceNode!)
        .ToDictionary(group => group.Key, group => group.Min(row => row.CreatedAt));

    // Union of every node seen by any of the four counts, in ordinal order.
    var nodes = new SortedSet<string>(StringComparer.Ordinal);
    nodes.UnionWith(depthByNode.Keys);
    nodes.UnionWith(stuckByNode.Keys);
    nodes.UnionWith(parkedByNode.Keys);
    nodes.UnionWith(deliveredByNode.Keys);

    var snapshots = new List<NodeNotificationKpiSnapshot>(nodes.Count);
    foreach (var node in nodes)
    {
        // A missing count means zero; a node with no pending rows has no age.
        TimeSpan? pendingAge = earliestByNode.TryGetValue(node, out var earliest)
            ? capturedAt - earliest
            : null;

        snapshots.Add(new NodeNotificationKpiSnapshot(
            SourceNode: node,
            QueueDepth: depthByNode.GetValueOrDefault(node),
            StuckCount: stuckByNode.GetValueOrDefault(node),
            ParkedCount: parkedByNode.GetValueOrDefault(node),
            DeliveredLastInterval: deliveredByNode.GetValueOrDefault(node),
            OldestPendingAge: pendingAge));
    }

    return snapshots;
}
|
||||
|
||||
/// <summary>Counts notification rows matching <paramref name="predicate"/>, grouped by source site.</summary>
|
||||
private async Task<Dictionary<string, int>> CountBySiteAsync(
|
||||
System.Linq.Expressions.Expression<Func<Notification, bool>> predicate,
|
||||
@@ -312,6 +369,22 @@ VALUES
|
||||
.ToDictionaryAsync(x => x.Site, x => x.Count, cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
/// Counts notification rows matching <paramref name="predicate"/>, grouped by source node.
/// Rows with a null <c>SourceNode</c> are filtered out here in addition to whatever the
/// predicate does, so a predicate that omits the guard cannot produce a null group key
/// (which the dictionary materialisation would reject).
/// </summary>
/// <param name="predicate">Row filter applied before grouping.</param>
/// <param name="cancellationToken">Cancellation token passed to the query.</param>
/// <returns>A map from node identifier to the number of matching rows for that node.</returns>
private async Task<Dictionary<string, int>> CountByNodeAsync(
    System.Linq.Expressions.Expression<Func<Notification, bool>> predicate,
    CancellationToken cancellationToken)
{
    return await _context.Notifications
        .Where(predicate)
        // Enforced locally rather than trusted to each caller's predicate.
        .Where(n => n.SourceNode != null)
        .GroupBy(n => n.SourceNode!)
        .Select(g => new { Node = g.Key, Count = g.Count() })
        .ToDictionaryAsync(x => x.Node, x => x.Count, cancellationToken);
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<int> SaveChangesAsync(CancellationToken cancellationToken = default)
|
||||
=> await _context.SaveChangesAsync(cancellationToken);
|
||||
|
||||
+71
@@ -324,6 +324,61 @@ ORDER BY CreatedAtUtc DESC, TrackedOperationId DESC;";
|
||||
StuckCount: stuck.GetValueOrDefault(site))).ToList();
|
||||
}
|
||||
|
||||
/// <inheritdoc />
public async Task<IReadOnlyList<SiteCallNodeKpiSnapshot>> ComputePerNodeKpisAsync(
    DateTime stuckCutoff, DateTime intervalSince, CancellationToken ct = default)
{
    // Reference instant for OldestPendingAge, taken once before the first query.
    var capturedAt = DateTime.UtcNow;

    // Every predicate requires a non-null SourceNode: per-node KPIs only make sense when
    // the node identity is known, and it keeps the GROUP BY key non-null.
    var bufferedByNode = await CountByNodeAsync(
        s => s.TerminalAtUtc == null && s.SourceNode != null, ct);

    var parkedByNode = await CountByNodeAsync(
        s => s.Status == StatusParked && s.SourceNode != null, ct);

    var failedByNode = await CountByNodeAsync(
        s => s.Status == StatusFailed
            && s.TerminalAtUtc != null && s.TerminalAtUtc >= intervalSince
            && s.SourceNode != null, ct);

    var deliveredByNode = await CountByNodeAsync(
        s => s.Status == StatusDelivered
            && s.TerminalAtUtc != null && s.TerminalAtUtc >= intervalSince
            && s.SourceNode != null, ct);

    var stuckByNode = await CountByNodeAsync(
        s => s.TerminalAtUtc == null && s.CreatedAtUtc < stuckCutoff
            && s.SourceNode != null, ct);

    // Earliest non-terminal CreatedAtUtc per node, computed by a grouped MIN in the query.
    var earliestRows = await _context.SiteCalls
        .Where(s => s.TerminalAtUtc == null && s.SourceNode != null)
        .GroupBy(s => s.SourceNode!)
        .Select(g => new { Node = g.Key, Oldest = g.Min(s => s.CreatedAtUtc) })
        .ToListAsync(ct);
    var earliestByNode = earliestRows.ToDictionary(row => row.Node, row => row.Oldest);

    // Union of every node seen by any of the five counts, in ordinal order.
    var nodes = new SortedSet<string>(StringComparer.Ordinal);
    nodes.UnionWith(bufferedByNode.Keys);
    nodes.UnionWith(parkedByNode.Keys);
    nodes.UnionWith(failedByNode.Keys);
    nodes.UnionWith(deliveredByNode.Keys);
    nodes.UnionWith(stuckByNode.Keys);

    var snapshots = new List<SiteCallNodeKpiSnapshot>(nodes.Count);
    foreach (var node in nodes)
    {
        // A missing count means zero; a node with no pending rows has no age.
        TimeSpan? pendingAge = earliestByNode.TryGetValue(node, out var earliest)
            ? capturedAt - earliest
            : null;

        snapshots.Add(new SiteCallNodeKpiSnapshot(
            SourceNode: node,
            BufferedCount: bufferedByNode.GetValueOrDefault(node),
            ParkedCount: parkedByNode.GetValueOrDefault(node),
            FailedLastInterval: failedByNode.GetValueOrDefault(node),
            DeliveredLastInterval: deliveredByNode.GetValueOrDefault(node),
            OldestPendingAge: pendingAge,
            StuckCount: stuckByNode.GetValueOrDefault(node)));
    }

    return snapshots;
}
|
||||
|
||||
/// <summary>Counts <c>SiteCalls</c> rows matching <paramref name="predicate"/>, grouped by source site.</summary>
|
||||
private async Task<Dictionary<string, int>> CountBySiteAsync(
|
||||
System.Linq.Expressions.Expression<Func<SiteCall, bool>> predicate,
|
||||
@@ -336,6 +391,22 @@ ORDER BY CreatedAtUtc DESC, TrackedOperationId DESC;";
|
||||
.ToDictionaryAsync(x => x.Site, x => x.Count, ct);
|
||||
}
|
||||
|
||||
/// <summary>
/// Counts <c>SiteCalls</c> rows matching <paramref name="predicate"/>, grouped by source node.
/// Rows with a null <c>SourceNode</c> are filtered out here in addition to whatever the
/// predicate does, so a predicate that omits the guard cannot produce a null group key
/// (which the dictionary materialisation would reject).
/// </summary>
/// <param name="predicate">Row filter applied before grouping.</param>
/// <param name="ct">Cancellation token passed to the query.</param>
/// <returns>A map from node identifier to the number of matching rows for that node.</returns>
private async Task<Dictionary<string, int>> CountByNodeAsync(
    System.Linq.Expressions.Expression<Func<SiteCall, bool>> predicate,
    CancellationToken ct)
{
    return await _context.SiteCalls
        .Where(predicate)
        // Enforced locally rather than trusted to each caller's predicate.
        .Where(s => s.SourceNode != null)
        .GroupBy(s => s.SourceNode!)
        .Select(g => new { Node = g.Key, Count = g.Count() })
        .ToDictionaryAsync(x => x.Node, x => x.Count, ct);
}
|
||||
|
||||
private static int GetRankOrThrow(string status)
|
||||
{
|
||||
if (!StatusRank.TryGetValue(status, out var rank))
|
||||
|
||||
@@ -122,6 +122,7 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
|
||||
Receive<DiscardNotificationRequest>(HandleDiscard);
|
||||
Receive<NotificationKpiRequest>(HandleKpiRequest);
|
||||
Receive<PerSiteNotificationKpiRequest>(HandlePerSiteKpiRequest);
|
||||
Receive<PerNodeNotificationKpiRequest>(HandlePerNodeKpiRequest);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
@@ -1081,6 +1082,38 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
|
||||
return new PerSiteNotificationKpiResponse(correlationId, Success: true, ErrorMessage: null, sites);
|
||||
}
|
||||
|
||||
/// <summary>
/// Answers a <see cref="PerNodeNotificationKpiRequest"/> with the per-source-node outbox
/// metrics. The stuck cutoff comes from <see cref="StuckCutoff"/> and the delivered window
/// from the configured <c>DeliveredKpiWindow</c>, both relative to the moment the request
/// is handled. Additive alongside <see cref="HandlePerSiteKpiRequest"/> — per-site
/// behaviour is unchanged. A failed computation is answered with an unsuccessful
/// response carrying the base exception's message and no nodes.
/// </summary>
private void HandlePerNodeKpiRequest(PerNodeNotificationKpiRequest request)
{
    // The reply target is copied to a local before the asynchronous work is started.
    var replyTo = Sender;
    var requestedAt = DateTimeOffset.UtcNow;
    var stuckBefore = StuckCutoff(requestedAt);
    var windowStart = requestedAt - _options.DeliveredKpiWindow;

    var computation = ComputePerNodeKpisAsync(request.CorrelationId, stuckBefore, windowStart);

    computation.PipeTo(
        replyTo,
        success: reply => reply,
        failure: fault => new PerNodeNotificationKpiResponse(
            request.CorrelationId,
            Success: false,
            ErrorMessage: fault.GetBaseException().Message,
            Nodes: Array.Empty<NodeNotificationKpiSnapshot>()));
}
|
||||
|
||||
// Resolves the outbox repository from a fresh DI scope, computes the per-node KPIs and
// wraps them in a successful response. The scope is disposed when the method completes;
// exceptions propagate to the caller.
private async Task<PerNodeNotificationKpiResponse> ComputePerNodeKpisAsync(
    string correlationId, DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince)
{
    using (var scope = _serviceProvider.CreateScope())
    {
        var outboxRepository = scope.ServiceProvider.GetRequiredService<INotificationOutboxRepository>();
        var perNode = await outboxRepository.ComputePerNodeKpisAsync(stuckCutoff, deliveredSince);

        return new PerNodeNotificationKpiResponse(
            correlationId,
            Success: true,
            ErrorMessage: null,
            perNode);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// The instant before which a still-pending notification counts as stuck — <paramref name="now"/>
|
||||
/// offset back by <see cref="NotificationOutboxOptions.StuckAgeThreshold"/>.
|
||||
|
||||
@@ -239,6 +239,7 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
Receive<SiteCallDetailRequest>(HandleDetail);
|
||||
Receive<SiteCallKpiRequest>(HandleKpi);
|
||||
Receive<PerSiteSiteCallKpiRequest>(HandlePerSiteKpi);
|
||||
Receive<PerNodeSiteCallKpiRequest>(HandlePerNodeKpi);
|
||||
|
||||
// Task 5 (#22): central→site Retry/Discard relay for parked cached calls.
|
||||
Receive<RegisterCentralCommunication>(msg =>
|
||||
@@ -817,6 +818,47 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Answers a <see cref="PerNodeSiteCallKpiRequest"/> with the per-node site-call metrics.
/// The stuck cutoff and interval start are derived from the configured
/// <c>StuckAgeThreshold</c> and <c>KpiInterval</c>, relative to the moment the request is
/// handled. Additive alongside <see cref="HandlePerSiteKpi"/> — per-site behaviour is
/// unchanged. A failed computation is answered with an unsuccessful response carrying
/// the base exception's message and no nodes.
/// </summary>
private void HandlePerNodeKpi(PerNodeSiteCallKpiRequest request)
{
    // The reply target is copied to a local before the asynchronous work is started.
    var replyTo = Sender;
    var requestedAt = DateTime.UtcNow;
    var stuckBefore = requestedAt - _options.StuckAgeThreshold;
    var windowStart = requestedAt - _options.KpiInterval;

    var computation = PerNodeKpiAsync(request.CorrelationId, stuckBefore, windowStart);

    computation.PipeTo(
        replyTo,
        success: reply => reply,
        failure: fault => new PerNodeSiteCallKpiResponse(
            request.CorrelationId,
            Success: false,
            ErrorMessage: fault.GetBaseException().Message,
            Nodes: Array.Empty<SiteCallNodeKpiSnapshot>()));
}
|
||||
|
||||
// Resolves the repository (and the scope that may own it), computes the per-node KPIs
// and wraps them in a successful response. The scope, when one was returned, is disposed
// whether or not the query succeeds; exceptions propagate to the caller.
private async Task<PerNodeSiteCallKpiResponse> PerNodeKpiAsync(
    string correlationId, DateTime stuckCutoff, DateTime intervalSince)
{
    var (scope, repository) = ResolveRepository();
    try
    {
        var pending = repository.ComputePerNodeKpisAsync(stuckCutoff, intervalSince);
        var perNode = await pending.ConfigureAwait(false);

        return new PerNodeSiteCallKpiResponse(
            correlationId,
            Success: true,
            ErrorMessage: null,
            perNode);
    }
    finally
    {
        if (scope != null)
        {
            scope.Dispose();
        }
    }
}
|
||||
|
||||
// ── Task 5: central→site Retry/Discard relay ──
|
||||
|
||||
/// <summary>
|
||||
|
||||
Reference in New Issue
Block a user