feat(audit): M5.2 per-node stuck-count KPIs (T6) — repo per-node aggregation, actor message pair, CentralUI tiles

This commit is contained in:
Joseph Doherty
2026-06-16 21:34:14 -04:00
parent a07ff28f10
commit 209f368cb5
25 changed files with 840 additions and 6 deletions
@@ -100,6 +100,19 @@ public interface INotificationOutboxRepository
Task<IReadOnlyList<SiteNotificationKpiSnapshot>> ComputePerSiteKpisAsync(
DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default);
/// <summary>
/// Computes one point-in-time <see cref="NodeNotificationKpiSnapshot"/> for each
/// originating node (rows are grouped by <c>SourceNode</c>). A node appears in the
/// result only if it has at least one notification row; rows whose <c>SourceNode</c>
/// is <c>NULL</c> are excluded from the aggregation.
/// </summary>
/// <remarks>
/// Both cutoffs are supplied by the caller, whereas the current time used to derive
/// <see cref="NodeNotificationKpiSnapshot.OldestPendingAge"/> is captured inside the method.
/// </remarks>
/// <param name="stuckCutoff">
/// Stuck threshold: a non-terminal row whose <c>CreatedAt</c> is older than this value
/// counts towards <see cref="NodeNotificationKpiSnapshot.StuckCount"/>.
/// </param>
/// <param name="deliveredSince">
/// Start of the delivered window: a Delivered row whose <c>DeliveredAt</c> is at or after
/// this value counts towards <see cref="NodeNotificationKpiSnapshot.DeliveredLastInterval"/>.
/// </param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>A list of per-node KPI snapshots, ordered by node name.</returns>
Task<IReadOnlyList<NodeNotificationKpiSnapshot>> ComputePerNodeKpisAsync(
DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default);
/// <summary>
/// Persists pending changes tracked on the underlying context. Use this when staging
/// multiple changes for a single commit; the individual mutating methods on this
@@ -107,4 +107,19 @@ public interface ISiteCallAuditRepository
DateTime stuckCutoff,
DateTime intervalSince,
CancellationToken ct = default);
/// <summary>
/// Computes one point-in-time <see cref="SiteCallNodeKpiSnapshot"/> for each
/// originating node. A node appears in the result only if it has at least one
/// <c>SiteCalls</c> row; rows whose <c>SourceNode</c> is <c>NULL</c> are excluded.
/// The stuck cutoff and interval bounds are interpreted as in
/// <see cref="ComputeKpisAsync"/>.
/// </summary>
/// <param name="stuckCutoff">
/// UTC threshold for classifying a row as stuck: a non-terminal row whose
/// <c>CreatedAtUtc</c> is older than this value counts towards
/// <see cref="SiteCallNodeKpiSnapshot.StuckCount"/>.
/// </param>
/// <param name="intervalSince">
/// UTC start of the delivered/failed interval window: <c>Delivered</c> and
/// <c>Failed</c> rows whose <c>TerminalAtUtc</c> is at or after this value are
/// counted in the corresponding "last interval" fields.
/// </param>
/// <param name="ct">Cancellation token.</param>
/// <returns>
/// A task that resolves to the per-node KPI list. Nodes with no rows are omitted,
/// and rows with a <c>NULL</c> <c>SourceNode</c> contribute to no entry.
/// </returns>
Task<IReadOnlyList<SiteCallNodeKpiSnapshot>> ComputePerNodeKpisAsync(
DateTime stuckCutoff,
DateTime intervalSince,
CancellationToken ct = default);
}
@@ -164,3 +164,24 @@ public sealed record PerSiteSiteCallKpiResponse(
bool Success,
string? ErrorMessage,
IReadOnlyList<SiteCallSiteKpiSnapshot> Sites);
/// <summary>
/// Site Calls UI -> Central: request for the per-node <c>SiteCalls</c>
/// KPI breakdown. Mirrors <see cref="PerSiteSiteCallKpiRequest"/> but groups
/// by <c>SourceNode</c> instead of <c>SourceSite</c>. Additive — does not
/// change per-site behaviour. Answered with a <see cref="PerNodeSiteCallKpiResponse"/>.
/// </summary>
/// <param name="CorrelationId">
/// Caller-supplied identifier for this request.
/// NOTE(review): presumably echoed back on the response so the UI can pair the two —
/// confirm against the handling actor.
/// </param>
public sealed record PerNodeSiteCallKpiRequest(
string CorrelationId);
/// <summary>
/// Central -> Site Calls UI: per-node KPI breakdown for the Site Calls KPIs
/// page, sent in reply to a <see cref="PerNodeSiteCallKpiRequest"/>. On a
/// repository fault <see cref="Success"/> is <c>false</c>,
/// <see cref="ErrorMessage"/> carries the cause, and <see cref="Nodes"/> is empty.
/// Rows with a <c>NULL</c> <c>SourceNode</c> are excluded, so no entry in
/// <see cref="Nodes"/> represents them.
/// </summary>
/// <param name="CorrelationId">
/// Identifier tying this response to a request.
/// NOTE(review): presumably the originating request's <c>CorrelationId</c> — confirm
/// against the handling actor.
/// </param>
/// <param name="Success"><c>false</c> when the repository query faulted.</param>
/// <param name="ErrorMessage">The cause of the fault when <see cref="Success"/> is <c>false</c>.</param>
/// <param name="Nodes">One snapshot per originating node; empty on a repository fault.</param>
public sealed record PerNodeSiteCallKpiResponse(
string CorrelationId,
bool Success,
string? ErrorMessage,
IReadOnlyList<SiteCallNodeKpiSnapshot> Nodes);
@@ -159,3 +159,23 @@ public record PerSiteNotificationKpiResponse(
bool Success,
string? ErrorMessage,
IReadOnlyList<SiteNotificationKpiSnapshot> Sites);
/// <summary>
/// Outbox UI -> Central: request for the per-node notification outbox KPI breakdown.
/// Mirrors <see cref="PerSiteNotificationKpiRequest"/> but groups by <c>SourceNode</c>
/// instead of <c>SourceSiteId</c>. Additive — does not change per-site behaviour.
/// Answered with a <see cref="PerNodeNotificationKpiResponse"/>.
/// </summary>
/// <param name="CorrelationId">
/// Caller-supplied identifier for this request.
/// NOTE(review): presumably echoed back on the response so the UI can pair the two —
/// confirm against the handling actor.
/// </param>
public record PerNodeNotificationKpiRequest(
string CorrelationId);
/// <summary>
/// Central -> Outbox UI: per-node KPI breakdown for the Notification KPIs page, sent
/// in reply to a <see cref="PerNodeNotificationKpiRequest"/>. On a repository fault
/// <see cref="Success"/> is <c>false</c>, <see cref="ErrorMessage"/> carries the cause,
/// and <see cref="Nodes"/> is empty. Rows with a <c>NULL</c> <c>SourceNode</c> are
/// excluded, so no entry in <see cref="Nodes"/> represents them.
/// </summary>
/// <param name="CorrelationId">
/// Identifier tying this response to a request.
/// NOTE(review): presumably the originating request's <c>CorrelationId</c> — confirm
/// against the handling actor.
/// </param>
/// <param name="Success"><c>false</c> when the repository query faulted.</param>
/// <param name="ErrorMessage">The cause of the fault when <see cref="Success"/> is <c>false</c>.</param>
/// <param name="Nodes">One snapshot per originating node; empty on a repository fault.</param>
public record PerNodeNotificationKpiResponse(
string CorrelationId,
bool Success,
string? ErrorMessage,
IReadOnlyList<NodeNotificationKpiSnapshot> Nodes);
@@ -0,0 +1,37 @@
namespace ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
/// <summary>
/// Point-in-time <c>SiteCalls</c> metrics scoped to a single originating node. The
/// per-node counterpart of <see cref="SiteCallSiteKpiSnapshot"/>; surfaced in the
/// per-node breakdown table on the Site Calls KPIs page. Mirrors
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Notifications.NodeNotificationKpiSnapshot"/>.
/// </summary>
/// <param name="SourceNode">
/// The node identifier these metrics are scoped to (e.g. <c>node-a</c>,
/// <c>node-b</c>). Rows with a <c>NULL</c> <c>SourceNode</c> are omitted, so this
/// value identifies a concrete node.
/// </param>
/// <param name="BufferedCount">
/// Count of this node's non-terminal rows (<c>TerminalAtUtc IS NULL</c>).
/// </param>
/// <param name="ParkedCount">Count of this node's rows in the <c>Parked</c> status.</param>
/// <param name="FailedLastInterval">
/// Count of this node's <c>Failed</c> rows whose <c>TerminalAtUtc</c> is at or
/// after the "since" timestamp (the start of the reporting interval).
/// </param>
/// <param name="DeliveredLastInterval">
/// Count of this node's <c>Delivered</c> rows whose <c>TerminalAtUtc</c> is at
/// or after the "since" timestamp (the start of the reporting interval).
/// </param>
/// <param name="OldestPendingAge">
/// Age of this node's oldest non-terminal row, or <c>null</c> when it has none.
/// </param>
/// <param name="StuckCount">
/// Count of this node's non-terminal rows whose <c>CreatedAtUtc</c> is older
/// than the stuck cutoff. These rows are a subset of those counted by
/// <paramref name="BufferedCount"/>.
/// </param>
public sealed record SiteCallNodeKpiSnapshot(
string SourceNode,
int BufferedCount,
int ParkedCount,
int FailedLastInterval,
int DeliveredLastInterval,
TimeSpan? OldestPendingAge,
int StuckCount);
@@ -0,0 +1,30 @@
namespace ZB.MOM.WW.ScadaBridge.Commons.Types.Notifications;
/// <summary>
/// Point-in-time notification-outbox metrics scoped to a single originating node.
/// The per-node counterpart of <see cref="SiteNotificationKpiSnapshot"/>; surfaced
/// in the per-node breakdown table on the Notification KPIs page.
/// </summary>
/// <param name="SourceNode">
/// The node identifier these metrics are scoped to (e.g. <c>node-a</c>,
/// <c>node-b</c>). Rows with a <c>NULL</c> <c>SourceNode</c> are omitted, so this
/// value identifies a concrete node.
/// </param>
/// <param name="QueueDepth">Count of this node's non-terminal rows (Pending + Retrying).</param>
/// <param name="StuckCount">
/// Count of this node's non-terminal rows whose <c>CreatedAt</c> is older than the stuck
/// cutoff. These rows are a subset of those counted by <paramref name="QueueDepth"/>.
/// </param>
/// <param name="ParkedCount">Count of this node's rows in the Parked status.</param>
/// <param name="DeliveredLastInterval">
/// Count of this node's Delivered rows whose <c>DeliveredAt</c> is at or after the
/// "delivered since" timestamp (the start of the reporting interval).
/// </param>
/// <param name="OldestPendingAge">
/// Age of this node's oldest non-terminal row, or <c>null</c> when it has none.
/// </param>
public record NodeNotificationKpiSnapshot(
string SourceNode,
int QueueDepth,
int StuckCount,
int ParkedCount,
int DeliveredLastInterval,
TimeSpan? OldestPendingAge);