feat(kpi): K16 — Health dashboard per-site trend panel

This commit is contained in:
Joseph Doherty
2026-06-17 20:36:09 -04:00
parent 3595a41349
commit 7d7c6cbb05
2 changed files with 305 additions and 0 deletions
@@ -10,12 +10,14 @@
@using ZB.MOM.WW.ScadaBridge.Commons.Messages.Notification
@using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit
@using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit
@using ZB.MOM.WW.ScadaBridge.Commons.Types.Kpi
@using ZB.MOM.WW.ScadaBridge.Communication
@implements IDisposable
@inject ICentralHealthAggregator HealthAggregator
@inject ISiteRepository SiteRepository
@inject CommunicationService CommunicationService
@inject IAuditLogQueryService AuditLogQueryService
@inject IKpiHistoryQueryService KpiHistory
<div class="container-fluid mt-3">
<div class="d-flex justify-content-between align-items-center mb-3">
@@ -76,6 +78,74 @@
IsAvailable="@_auditKpiAvailable"
ErrorMessage="@_auditKpiError" />
@* Site Health Trends (M6 K16) — per-site Site Health KPI history. Loads on a
separate path from the 10s tile-refresh timer so a trend-query fault can
never disturb the live dashboard or its polling loop. The site selector
reuses the site keys already loaded into _siteStates; the window toggle
drives the time range. Both re-query independently. *@
<div class="card mb-3" data-test="site-health-trends">
<div class="card-header d-flex justify-content-between align-items-center py-2">
<div class="d-flex align-items-center">
<h6 class="text-muted mb-0 me-3">Site Health Trends</h6>
@if (_trendSiteKeys.Count > 0)
{
<select class="form-select form-select-sm" style="width:auto"
data-test="site-health-trends-site"
value="@_trendSiteId"
@onchange="OnTrendSiteChangedAsync">
@foreach (var key in _trendSiteKeys)
{
<option value="@key">@TrendSiteLabel(key)</option>
}
</select>
}
</div>
<div class="btn-group btn-group-sm" role="group" aria-label="Trend window">
<button type="button"
class="btn @(_trendWindowHours == 24 ? "btn-primary" : "btn-outline-secondary")"
@onclick="() => SetTrendWindowAsync(24)" disabled="@_trendsLoading">24h</button>
<button type="button"
class="btn @(_trendWindowHours == 168 ? "btn-primary" : "btn-outline-secondary")"
@onclick="() => SetTrendWindowAsync(168)" disabled="@_trendsLoading">7d</button>
</div>
</div>
<div class="card-body p-3">
@if (_trendSiteKeys.Count == 0)
{
<span class="text-muted small">No sites available for trends yet.</span>
}
else
{
<div class="row g-3">
<div class="col-lg-3 col-md-6">
<KpiTrendChart Title="Connections Down"
Points="@_connectionsDownSeries"
IsAvailable="@_connectionsDownAvailable"
ErrorMessage="@_connectionsDownError" />
</div>
<div class="col-lg-3 col-md-6">
<KpiTrendChart Title="Dead Letters"
Points="@_deadLettersSeries"
IsAvailable="@_deadLettersAvailable"
ErrorMessage="@_deadLettersError" />
</div>
<div class="col-lg-3 col-md-6">
<KpiTrendChart Title="Script Errors"
Points="@_scriptErrorsSeries"
IsAvailable="@_scriptErrorsAvailable"
ErrorMessage="@_scriptErrorsError" />
</div>
<div class="col-lg-3 col-md-6">
<KpiTrendChart Title="S&amp;F Buffer Depth"
Points="@_sfBufferDepthSeries"
IsAvailable="@_sfBufferDepthAvailable"
ErrorMessage="@_sfBufferDepthError" />
</div>
</div>
}
</div>
</div>
@if (_siteStates.Count == 0)
{
<div class="alert alert-info">No site health reports received yet.</div>
@@ -387,6 +457,36 @@
Array.Empty<SiteCallNodeKpiSnapshot>();
private bool _siteCallNodeKpiAvailable;
// ── Site Health Trends (M6 K16) ───────────────────────────────────────────
// Per-site Site Health KPI history, loaded on a path entirely separate from
// the 10s tile-refresh timer (LoadSiteHealthTrendsAsync, never called from
// the timer tick). The site keys are a snapshot of the dashboard's site set,
// captured only when LoadSiteHealthTrendsAsync runs with refreshSiteKeys: true
// (the initial load); the site-change and window-toggle re-queries pass false
// and keep the existing snapshot, so the selector does not follow later
// changes to the live cards.
// Window in hours: 24h (default) or 168h (7d). Changing the selected site OR
// the window re-queries. Each metric chart carries its own availability +
// error so one failed GetSeriesAsync degrades a single chart, never the
// dashboard.
// Entries offered by the site selector; empty until the first snapshot.
private IReadOnlyList<string> _trendSiteKeys = Array.Empty<string>();
// Currently selected site key; null until a site is available to select.
private string? _trendSiteId;
// Selected trend window in hours; the markup toggles between 24 and 168.
private int _trendWindowHours = 24;
// True while a trend load is in flight; the window buttons bind their
// disabled attribute to it.
private bool _trendsLoading;
// One (series, available, error) triple per chart, bound to that chart's
// Points / IsAvailable / ErrorMessage parameters.
private IReadOnlyList<KpiSeriesPoint>? _connectionsDownSeries;
private bool _connectionsDownAvailable = true;
private string? _connectionsDownError;
private IReadOnlyList<KpiSeriesPoint>? _deadLettersSeries;
private bool _deadLettersAvailable = true;
private string? _deadLettersError;
private IReadOnlyList<KpiSeriesPoint>? _scriptErrorsSeries;
private bool _scriptErrorsAvailable = true;
private string? _scriptErrorsError;
private IReadOnlyList<KpiSeriesPoint>? _sfBufferDepthSeries;
private bool _sfBufferDepthAvailable = true;
private string? _sfBufferDepthError;
private static bool SiteHasActiveErrors(SiteHealthState state)
{
var report = state.LatestReport;
@@ -410,6 +510,13 @@
}
await RefreshNow();
// Site Health Trends (M6 K16) load on their own path — never from the
// timer tick below — so a trend-query fault can't disturb the live tile
// refresh. Seed the selector from the sites just loaded into _siteStates
// and query the default site.
await LoadSiteHealthTrendsAsync();
_refreshTimer = new Timer(_ =>
{
InvokeAsync(async () =>
@@ -420,6 +527,109 @@
}, null, TimeSpan.FromSeconds(_autoRefreshSeconds), TimeSpan.FromSeconds(_autoRefreshSeconds));
}
// Change handler for the trend site selector. An empty value, or a value
// equal to the site already selected, is ignored; any other value becomes
// the selected site and the trend series are re-queried without
// re-snapshotting the selector entries. Best-effort: the per-series fetch
// swallows its own faults, so nothing is caught here.
private async Task OnTrendSiteChangedAsync(ChangeEventArgs e)
{
    var pickedSiteId = e.Value?.ToString();
    var isNewSelection = !string.IsNullOrEmpty(pickedSiteId) && pickedSiteId != _trendSiteId;
    if (isNewSelection)
    {
        _trendSiteId = pickedSiteId;
        await LoadSiteHealthTrendsAsync(refreshSiteKeys: false);
    }
}
// Click handler for the window toggle (24h ↔ 7d). Stores the new window, in
// hours, and re-queries the trend series for the current site; a click on
// the window that is already active does nothing.
private async Task SetTrendWindowAsync(int windowHours)
{
    if (_trendWindowHours != windowHours)
    {
        _trendWindowHours = windowHours;
        await LoadSiteHealthTrendsAsync(refreshSiteKeys: false);
    }
}
// Monotonic id of the most recently started trend load. A load that resumes
// after an await and finds a different value has been superseded by a newer
// site/window selection and must not publish its results. A plain int is
// enough on the assumption that every call arrives on the component's
// renderer synchronization context (event handlers and lifecycle methods).
private int _trendLoadGeneration;

// Loads the four Site Health trend series for the selected site over the
// selected window. Deliberately decoupled from RefreshNow / the 10s timer:
// a fault here degrades the affected chart(s) only and never propagates to
// the tile-refresh loop.
//
// refreshSiteKeys re-snapshots the dashboard's site set into the selector
// (true on init); the site-change / window-toggle paths pass false so a
// mid-interaction site addition/removal can't yank the operator's choice.
//
// Overlapping loads: the site selector stays enabled while a load is in
// flight, so a second load can start before the first finishes. Every load
// takes a generation number; only the newest one commits its series and
// clears _trendsLoading. A superseded load returns without touching any
// chart state, so the charts never show data for a site or window other
// than the one currently selected.
private async Task LoadSiteHealthTrendsAsync(bool refreshSiteKeys = true)
{
    if (refreshSiteKeys)
    {
        // Mirror the dashboard ordering: central cluster pinned first, then
        // sites alphabetically — the same comparer the detail cards use.
        _trendSiteKeys = _siteStates.Keys
            .OrderBy(k => k == CentralHealthReportLoop.CentralSiteId ? 0 : 1)
            .ThenBy(k => k)
            .ToList();

        // Default to the first site (or keep a still-valid prior selection).
        if (_trendSiteId == null || !_trendSiteKeys.Contains(_trendSiteId))
        {
            _trendSiteId = _trendSiteKeys.FirstOrDefault();
        }
    }

    if (string.IsNullOrEmpty(_trendSiteId))
    {
        return;
    }

    var generation = ++_trendLoadGeneration;
    _trendsLoading = true;
    try
    {
        // Capture the query inputs once so all four series cover the same
        // site and window even if the selection changes mid-load.
        var toUtc = DateTime.UtcNow;
        var fromUtc = toUtc - TimeSpan.FromHours(_trendWindowHours);
        var siteId = _trendSiteId;

        // Queries stay sequential, as before; after each await a superseded
        // load stops early instead of issuing further queries.
        var connectionsDown = await LoadTrendSeriesAsync("connectionsDown", siteId, fromUtc, toUtc);
        if (generation != _trendLoadGeneration)
        {
            return;
        }

        var deadLetters = await LoadTrendSeriesAsync("deadLetters", siteId, fromUtc, toUtc);
        if (generation != _trendLoadGeneration)
        {
            return;
        }

        var scriptErrors = await LoadTrendSeriesAsync("scriptErrors", siteId, fromUtc, toUtc);
        if (generation != _trendLoadGeneration)
        {
            return;
        }

        var sfBufferDepth = await LoadTrendSeriesAsync("sfBufferDepth", siteId, fromUtc, toUtc);
        if (generation != _trendLoadGeneration)
        {
            return;
        }

        // Commit all four charts together so a render triggered elsewhere
        // (e.g. the tile-refresh tick) never sees a half-updated panel.
        (_connectionsDownSeries, _connectionsDownAvailable, _connectionsDownError) = connectionsDown;
        (_deadLettersSeries, _deadLettersAvailable, _deadLettersError) = deadLetters;
        (_scriptErrorsSeries, _scriptErrorsAvailable, _scriptErrorsError) = scriptErrors;
        (_sfBufferDepthSeries, _sfBufferDepthAvailable, _sfBufferDepthError) = sfBufferDepth;
    }
    finally
    {
        // Only the newest load owns the loading flag; a superseded load must
        // not re-enable the window buttons while its successor is running.
        if (generation == _trendLoadGeneration)
        {
            _trendsLoading = false;
        }
    }
}
// Fetches one Site Health series for a site over [fromUtc, toUtc] from the
// KPI history service, using the Site scope with the site id as scope key.
// Returns (series, true, null) on success. Any exception is deliberately
// swallowed and reported as (null, false, "Trend data unavailable.") so the
// chart shows its unavailable placeholder — a failure here must NEVER break
// the dashboard.
private async Task<(IReadOnlyList<KpiSeriesPoint>?, bool, string?)> LoadTrendSeriesAsync(
    string metric, string siteId, DateTime fromUtc, DateTime toUtc)
{
    IReadOnlyList<KpiSeriesPoint>? points;
    try
    {
        points = await KpiHistory.GetSeriesAsync(
            KpiSources.SiteHealth,
            metric,
            KpiScopes.Site,
            siteId,
            fromUtc,
            toUtc);
    }
    catch
    {
        // Best-effort by design: degrade this one chart only.
        return (null, false, "Trend data unavailable.");
    }

    return (points, true, null);
}
// Display text for one selector entry: the central cluster key gets a fixed
// label; every other key is shown as "<site name> (<site key>)".
private string TrendSiteLabel(string siteKey)
{
    if (siteKey == CentralHealthReportLoop.CentralSiteId)
    {
        return "Central Cluster";
    }

    var siteName = GetSiteName(siteKey);
    return $"{siteName} ({siteKey})";
}
private async Task RefreshNow()
{
_siteStates = HealthAggregator.GetAllSiteStates();