perf(ui): topology page — staleness off the live loop + parallelized one-shot

This commit is contained in:
Joseph Doherty
2026-06-26 16:53:44 -04:00
parent eb59c4244f
commit 99254b71de
2 changed files with 210 additions and 15 deletions
@@ -9,6 +9,8 @@
@using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums
@using ZB.MOM.WW.ScadaBridge.DeploymentManager
@using ZB.MOM.WW.ScadaBridge.TemplateEngine.Services
@using System.Collections.Concurrent
@using Microsoft.Extensions.DependencyInjection
@attribute [Authorize(Policy = AuthorizationPolicies.RequireDeployment)]
@inject ITemplateEngineRepository TemplateEngineRepository
@inject ISiteRepository SiteRepository
@@ -21,6 +23,7 @@
@inject NavigationManager NavigationManager
@inject IJSRuntime JSRuntime
@inject IDialogService Dialog
@inject IServiceScopeFactory ScopeFactory
@implements IDisposable
<div class="container-fluid mt-3">
@@ -77,6 +80,19 @@
<button class="btn btn-outline-secondary" aria-label="Expand all areas" @onclick="() => _tree?.ExpandAll()">Expand</button>
<button class="btn btn-outline-secondary" aria-label="Collapse all areas" @onclick="() => _tree?.CollapseAll()">Collapse</button>
</div>
<button class="btn btn-outline-secondary btn-sm" aria-label="Re-check staleness"
title="Recompute the Stale/Current badge for every deployed instance"
@onclick="RecheckStalenessAsync" disabled="@_recheckingStaleness">
@if (_recheckingStaleness)
{
<span class="spinner-border spinner-border-sm me-1" role="status" aria-hidden="true"></span>
<span>Checking…</span>
}
else
{
<span><i class="bi bi-arrow-repeat me-1"></i>Re-check staleness</span>
}
</button>
<div class="form-check form-switch ms-2 mb-0">
<input type="checkbox" class="form-check-input" id="live-updates"
checked="@_liveUpdates" @onchange="OnLiveUpdatesToggled" />
@@ -121,9 +137,18 @@
// Areas appended by LoadDeployedStateAsync while it loads the deployed state.
private List<Area> _allAreas = new();
// Instance id -> "is stale" flag. Replaced wholesale by ComputeStalenessAsync;
// an instance missing from the map has simply not been compared.
private Dictionary<int, bool> _stalenessMap = new();
/// <summary>
/// Max concurrent per-instance staleness comparisons. Each comparison
/// re-flattens the instance's config, so the fan-out is capped to avoid a
/// thundering herd of flattens on a large topology (mirrors the
/// AlarmSummaryService DebugViewSnapshot fan-out cap).
/// </summary>
private const int MaxConcurrentStalenessChecks = 8;
// Set to true when LoadDeployedStateAsync starts loading.
private bool _loading = true;
// Reset to null at the start of LoadDeployedStateAsync.
// NOTE(review): presumably populated by that method's catch block — confirm.
private string? _errorMessage;
// NOTE(review): presumably guards the mutating actions (deploy/enable/...)
// against re-entry; its usage is outside this view — confirm.
private bool _actionInProgress;
// True while RecheckStalenessAsync is running; disables the
// "Re-check staleness" button and swaps its label for a spinner.
private bool _recheckingStaleness;
// NOTE(review): presumably the search filter applied by BuildTree
// (OnSearchChanged rebuilds the tree) — confirm.
private string _searchText = string.Empty;
@@ -178,7 +203,11 @@
InvokeAsync(async () =>
{
if (!_liveUpdates) return;
await LoadDataAsync();
// Staleness is deliberately OFF the live loop: it only changes when a
// template/instance is edited or redeployed, and recomputing it (a
// full re-flatten per deployed instance) every 15s is the page's
// dominant cost. The poll refreshes only the cheap deployed state.
await LoadDeployedStateAsync();
StateHasChanged();
});
}, null, LiveUpdatesInterval, LiveUpdatesInterval);
@@ -220,7 +249,25 @@
}
}
// Full reload: the cheap deployed state plus a one-shot staleness recompute.
// Used on initial load, the manual "Refresh" button, and after any mutating
// action (deploy/enable/disable/delete/move/rename) — all of which can change
// staleness. The live-update timer deliberately does NOT call this; it calls
// LoadDeployedStateAsync alone (staleness is off the 15s poll).
//
// If the deployed-state load reports an error, the staleness recompute is
// skipped: there is no point paying for a full re-flatten per instance
// against a failed (possibly partial) load while an error is on screen.
private async Task LoadDataAsync()
{
    await LoadDeployedStateAsync();

    // LoadDeployedStateAsync resets _errorMessage to null on entry, so a
    // non-null value here means that load recorded a failure. The previous
    // staleness map is left untouched in that case.
    if (_errorMessage is not null)
    {
        return;
    }

    await ComputeStalenessAsync();
}
/// <summary>
/// Fast path — loads the cheap deployed state (sites, instances, areas →
/// hierarchy + State badge) WITHOUT recomputing per-instance staleness. These
/// are all single, indexed central-DB reads. This is exactly what the
/// live-update timer polls every 15s. Internal so the live-update regression
/// test can drive the timer's code path directly.
/// </summary>
internal async Task LoadDeployedStateAsync()
{
_loading = true;
_errorMessage = null;
@@ -242,20 +289,6 @@
_allAreas.AddRange(areas);
}
_stalenessMap.Clear();
foreach (var inst in _allInstances.Where(i => i.State != InstanceState.NotDeployed))
{
try
{
var comparison = await DeploymentService.GetDeploymentComparisonAsync(inst.Id);
_stalenessMap[inst.Id] = comparison.IsSuccess && comparison.Value.IsStale;
}
catch
{
_stalenessMap[inst.Id] = false;
}
}
BuildTree();
}
catch (Exception ex)
@@ -265,6 +298,81 @@
_loading = false;
}
/// <summary>
/// Expensive path: rebuilds the instance-id → stale map for every instance
/// whose state is not <c>NotDeployed</c>, then rebuilds the tree. Each
/// instance is checked by <see cref="ComputeOneStalenessAsync"/>, and at most
/// <see cref="MaxConcurrentStalenessChecks"/> checks are in flight at once.
/// Intended for initial load and the on-demand "Re-check staleness" button,
/// not for the 15s live-update poll.
/// </summary>
/// <returns>A task that completes once the map is replaced and the tree rebuilt.</returns>
private async Task ComputeStalenessAsync()
{
    // Snapshot the ids up front so the fan-out works from a fixed list.
    var targets = new List<int>();
    foreach (var instance in _allInstances)
    {
        if (instance.State != InstanceState.NotDeployed)
        {
            targets.Add(instance.Id);
        }
    }

    var freshMap = new Dictionary<int, bool>();
    if (targets.Count > 0)
    {
        var outcomes = new ConcurrentDictionary<int, bool>();
        using (var throttle = new SemaphoreSlim(MaxConcurrentStalenessChecks, MaxConcurrentStalenessChecks))
        {
            var pending = targets.Select(id => ComputeOneStalenessAsync(id, throttle, outcomes));
            await Task.WhenAll(pending);
        }

        freshMap = new Dictionary<int, bool>(outcomes);
    }

    // Swap the whole map in one assignment, then refresh the badges.
    _stalenessMap = freshMap;
    BuildTree();
}
/// <summary>
/// Runs one instance's staleness comparison under the shared concurrency gate
/// and records the outcome in <paramref name="results"/>.
/// </summary>
/// <param name="instanceId">Id of the instance to compare.</param>
/// <param name="gate">Semaphore bounding how many comparisons run at once.</param>
/// <param name="results">Shared sink; receives <c>true</c> only for a successful, stale comparison.</param>
private async Task ComputeOneStalenessAsync(
    int instanceId, SemaphoreSlim gate, ConcurrentDictionary<int, bool> results)
{
    await gate.WaitAsync();
    try
    {
        bool isStale;
        try
        {
            // A fresh DI scope per comparison, so the concurrent comparisons
            // do not share scoped services (notably the DbContext).
            await using var scope = ScopeFactory.CreateAsyncScope();
            var service = scope.ServiceProvider.GetRequiredService<DeploymentService>();
            var outcome = await service.GetDeploymentComparisonAsync(instanceId);
            isStale = outcome.IsSuccess && outcome.Value.IsStale;
        }
        catch
        {
            // Best-effort: any failure is reported as "not stale".
            isStale = false;
        }

        results[instanceId] = isStale;
    }
    finally
    {
        gate.Release();
    }
}
// Handler for the "Re-check staleness" toolbar button. Ignores the click when
// a re-check is already running; otherwise raises the in-progress flag (which
// drives the button's spinner/disabled state), runs the staleness recompute,
// and always lowers the flag again.
private async Task RecheckStalenessAsync()
{
    if (!_recheckingStaleness)
    {
        _recheckingStaleness = true;
        // Render the spinner before the long-running recompute begins.
        StateHasChanged();

        try
        {
            await ComputeStalenessAsync();
        }
        finally
        {
            _recheckingStaleness = false;
        }
    }
}
private void OnSearchChanged()
{
BuildTree();
@@ -387,6 +387,93 @@ public class TopologyPageTests : BunitContext
Assert.Contains("Changed", markup);
}
/// <summary>
/// Test fixture: one site ("Plant-A") holding one Enabled instance
/// ("Pump-001", id 100), with the snapshot repository and the flattening
/// pipeline both stubbed so a deployment comparison for that instance can
/// resolve. The staleness-lifecycle tests count how many times the snapshot
/// repository is hit; they do not depend on the Stale/Current outcome.
/// </summary>
private void SeedOneDeployedInstanceWithComparison()
{
    var site = new Site("Plant-A", "plant-a") { Id = 1 };
    var pump = new Instance("Pump-001") { Id = 100, SiteId = 1, State = InstanceState.Enabled };
    SeedRepos(
        sites: new[] { site },
        instances: new[] { pump });

    // Deployed side of the comparison: the stored snapshot.
    var deployedConfig = new FlattenedConfiguration { InstanceUniqueName = "Pump-001" };
    var snapshot = new DeployedConfigSnapshot(
        "dep-1", "hash-old", JsonSerializer.Serialize(deployedConfig));
    _deployRepo.GetDeployedSnapshotByInstanceIdAsync(100, Arg.Any<CancellationToken>())
        .Returns(Task.FromResult<DeployedConfigSnapshot?>(snapshot));

    // Current side of the comparison: a fresh flatten with a different hash.
    var currentConfig = new FlattenedConfiguration { InstanceUniqueName = "Pump-001" };
    var flattened = new FlatteningPipelineResult(currentConfig, "hash-new", ValidationResult.Success());
    _pipeline.FlattenAndValidateAsync(100, Arg.Any<CancellationToken>())
        .Returns(Task.FromResult(Result<FlatteningPipelineResult>.Success(flattened)));
}
[Fact]
public async Task LiveUpdate_DoesNotRecomputeStaleness()
{
    // Performance regression guard. The live-update poll must refresh only
    // the deployed state (LoadDeployedStateAsync) and must not trigger the
    // per-instance deployment comparison. The comparison reads the deployed
    // snapshot from the repository, so that call count stands in for
    // "staleness was recomputed".
    SeedOneDeployedInstanceWithComparison();

    Task AssertSnapshotReadExactlyOnceAsync() =>
        _deployRepo.Received(1)
            .GetDeployedSnapshotByInstanceIdAsync(100, Arg.Any<CancellationToken>());

    var cut = Render<TopologyPage>();

    // The initial load computed staleness once.
    await AssertSnapshotReadExactlyOnceAsync();

    // Run the same method the live-update timer invokes.
    await cut.InvokeAsync(() => cut.Instance.LoadDeployedStateAsync());

    // No additional comparison happened — the count is unchanged.
    await AssertSnapshotReadExactlyOnceAsync();
}
[Fact]
public void DeployedState_Renders_AfterSplit()
{
    // After splitting the load into fast-state vs expensive-staleness, the
    // cheap deployed state (hierarchy + State badge) must still render.
    SeedOneDeployedInstanceWithComparison();

    var cut = Render<TopologyPage>();
    FindToggleForLabel(cut, "Plant-A")!.Click();

    Assert.Contains("Pump-001", cut.Markup);
    Assert.Contains("Enabled", cut.Markup);

    // The deployed instance shows a staleness badge (Stale or Current).
    // Match against rendered TEXT rather than raw markup: the "Re-check
    // staleness" toolbar button carries a title attribute that contains
    // "Stale/Current", so a markup-wide match succeeds even when no badge is
    // rendered. Attribute values are not part of TextContent.
    // NOTE(review): assumes the badge label is rendered as element text inside
    // the page's root "container-fluid" div — confirm against the tree markup.
    var renderedText = cut.Find("div.container-fluid").TextContent;
    Assert.Matches("Stale|Current", renderedText);
}
[Fact]
public async Task ManualRecheck_RecomputesStaleness()
{
    // The "Re-check staleness" button lets operators refresh staleness on
    // demand (since it's off the live loop). Clicking it runs the comparison
    // again — a second snapshot-repository call.
    SeedOneDeployedInstanceWithComparison();

    var cut = Render<TopologyPage>();

    // Initial load computed staleness once. Awaited like the equivalent
    // check in LiveUpdate_DoesNotRecomputeStaleness.
    await _deployRepo.Received(1)
        .GetDeployedSnapshotByInstanceIdAsync(100, Arg.Any<CancellationToken>());

    cut.Find("button[aria-label='Re-check staleness']").Click();

    // The click starts an async handler (a bounded-parallel fan-out through
    // fresh DI scopes), so the second comparison may not have happened by the
    // time Click() returns. Retry the check until it holds or bUnit's wait
    // times out, instead of asserting immediately.
    cut.WaitForAssertion(() =>
    {
        _ = _deployRepo.Received(2)
            .GetDeployedSnapshotByInstanceIdAsync(100, Arg.Any<CancellationToken>());
    });
}
[Fact]
public void LegacyInstancesRoute_IsDeclaredOnTopologyPage()
{