Admin /hosts red-badge + resilience columns + Polly telemetry observer. Closes task #164 (the remaining slice of Phase 6.1 Stream E.3 after the earlier publisher + hub PR). Three cooperating pieces wired together so the operator-facing /hosts table actually reflects the live Polly counters that the pipeline builder is producing. DriverResiliencePipelineBuilder gains an optional DriverResilienceStatusTracker ctor param — when non-null, every built pipeline wires Polly's OnRetry/OnOpened/OnClosed strategy-options callbacks into the tracker. OnRetry → tracker.RecordFailure (so ConsecutiveFailures climbs per retry), OnOpened → tracker.RecordBreakerOpen (stamps LastCircuitBreakerOpenUtc), OnClosed → tracker.RecordSuccess (resets the failure counter once the target recovers). Absent tracker = silent, preserving the unit-test constructor path + any deployment that doesn't care about resilience observability. Cancellation stays excluded from the failure count via the existing ShouldHandle predicate. HostStatusService.HostStatusRow extends with four new fields — ConsecutiveFailures, LastCircuitBreakerOpenUtc, CurrentBulkheadDepth, LastRecycleUtc — populated via a second LEFT JOIN onto DriverInstanceResilienceStatuses keyed on (DriverInstanceId, HostName). LEFT JOIN because brand-new hosts haven't been sampled yet; a missing row means zero failures + never-opened breaker, which is the correct default. New FailureFlagThreshold constant (=3, matches plan decision #143's conservative half-of-breaker convention) + IsFlagged predicate so the UI can pre-warn before the breaker actually trips. Hosts.razor paints three new columns between State and Last-transition — Fail# (bold red when flagged), In-flight (bulkhead-depth proxy), Breaker-opened (relative age). Per-row "Flagged" red badge alongside State when IsFlagged is true. Above the first cluster table, a red alert banner summarises the flagged-host count when ≥1 host is flagged, so operators see the problem before scanning rows. Three new tests in DriverResiliencePipelineBuilderTests — Tracker_RecordsFailure_OnEveryRetry verifies ConsecutiveFailures reaches RetryCount after a transient-forever operation, Tracker_StampsBreakerOpen_WhenBreakerTrips verifies LastBreakerOpenUtc is set after threshold failures on a Write pipeline, Tracker_IsolatesCounters_PerHost verifies one dead host does not leak failure counts into a healthy sibling. Full suite — Core.Tests 14/14 resilience-builder tests passing (11 existing + 3 new), Admin.Tests 72/72 passing, Admin project builds 0 errors. SignalR live push of status changes + browser visual review are deliberately left to a follow-up — this PR keeps the structural change minimal (polling refresh already exists in the page's 10s timer; SignalR would be a structural add that touches hub registration + client subscription).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-04-19 21:35:54 -04:00
parent 5536e96b46
commit d06cc01a48
4 changed files with 169 additions and 16 deletions

View File

@@ -56,6 +56,16 @@ else
</div></div></div>
</div>
@if (_rows.Any(HostStatusService.IsFlagged))
{
var flaggedCount = _rows.Count(HostStatusService.IsFlagged);
<div class="alert alert-danger small mb-3">
<strong>@flaggedCount host@(flaggedCount == 1 ? "" : "s")</strong>
reporting ≥ @HostStatusService.FailureFlagThreshold consecutive failures — circuit breaker
may trip soon. Inspect the resilience columns below to locate.
</div>
}
@foreach (var cluster in _rows.GroupBy(r => r.ClusterId ?? "(unassigned)").OrderBy(g => g.Key))
{
<h2 class="h5 mt-4">Cluster: <code>@cluster.Key</code></h2>
@@ -66,6 +76,9 @@ else
<th>Driver</th>
<th>Host</th>
<th>State</th>
<th class="text-end" title="Consecutive failures — resets when a call succeeds or the breaker closes">Fail#</th>
<th class="text-end" title="In-flight capability calls (bulkhead-depth proxy)">In-flight</th>
<th>Breaker opened</th>
<th>Last transition</th>
<th>Last seen</th>
<th>Detail</th>
@@ -84,10 +97,21 @@ else
{
<span class="badge bg-warning text-dark ms-1">Stale</span>
}
@if (HostStatusService.IsFlagged(r))
{
<span class="badge bg-danger ms-1" title="≥ @HostStatusService.FailureFlagThreshold consecutive failures">Flagged</span>
}
</td>
<td class="text-end small @(HostStatusService.IsFlagged(r) ? "text-danger fw-bold" : "")">
@r.ConsecutiveFailures
</td>
<td class="text-end small">@r.CurrentBulkheadDepth</td>
<td class="small">
@(r.LastCircuitBreakerOpenUtc is null ? "—" : FormatAge(r.LastCircuitBreakerOpenUtc.Value))
</td>
<td class="small">@FormatAge(r.StateChangedUtc)</td>
<td class="small @(HostStatusService.IsStale(r) ? "text-warning" : "")">@FormatAge(r.LastSeenUtc)</td>
<td class="text-truncate small" style="max-width: 320px;" title="@r.Detail">@r.Detail</td>
<td class="text-truncate small" style="max-width: 240px;" title="@r.Detail">@r.Detail</td>
</tr>
}
</tbody>

View File

@@ -7,8 +7,9 @@ namespace ZB.MOM.WW.OtOpcUa.Admin.Services;
/// <summary>
/// One row per <see cref="DriverHostStatus"/> record, enriched with the owning
/// <c>ClusterNode.ClusterId</c> when available (left-join). The Admin <c>/hosts</c> page
/// groups by cluster and renders a per-node → per-driver → per-host tree.
/// <c>ClusterNode.ClusterId</c> (left-join) + the per-<c>(DriverInstanceId, HostName)</c>
/// <see cref="DriverInstanceResilienceStatus"/> counters (also left-join) so the Admin
/// <c>/hosts</c> page renders the resilience surface inline with host state.
/// </summary>
public sealed record HostStatusRow(
string NodeId,
@@ -18,7 +19,11 @@ public sealed record HostStatusRow(
DriverHostState State,
DateTime StateChangedUtc,
DateTime LastSeenUtc,
string? Detail);
string? Detail,
int ConsecutiveFailures,
DateTime? LastCircuitBreakerOpenUtc,
int CurrentBulkheadDepth,
DateTime? LastRecycleUtc);
/// <summary>
/// Read-side service for the Admin UI's per-host drill-down. Loads
@@ -36,15 +41,26 @@ public sealed class HostStatusService(OtOpcUaConfigDbContext db)
{
public static readonly TimeSpan StaleThreshold = TimeSpan.FromSeconds(30);
/// <summary>Consecutive-failure threshold at which <see cref="IsFlagged"/> returns <c>true</c>
/// so the Admin UI can paint a red badge. Matches Phase 6.1 decision #143's conservative
/// half-of-breaker-threshold convention — flags before the breaker actually opens.</summary>
public const int FailureFlagThreshold = 3;
public async Task<IReadOnlyList<HostStatusRow>> ListAsync(CancellationToken ct = default)
{
// LEFT JOIN on NodeId so a row persists even when its owning ClusterNode row hasn't
// been created yet (first-boot bootstrap case — keeps the UI from losing sight of
// the reporting server).
// Two LEFT JOINs:
// 1. ClusterNodes on NodeId — row persists even when its owning ClusterNode row
// hasn't been created yet (first-boot bootstrap case).
// 2. DriverInstanceResilienceStatuses on (DriverInstanceId, HostName) — resilience
// counters haven't been sampled yet for brand-new hosts, so a missing row means
// zero failures + never-opened breaker.
var rows = await (from s in db.DriverHostStatuses.AsNoTracking()
join n in db.ClusterNodes.AsNoTracking()
on s.NodeId equals n.NodeId into nodeJoin
from n in nodeJoin.DefaultIfEmpty()
join r in db.DriverInstanceResilienceStatuses.AsNoTracking()
on new { s.DriverInstanceId, s.HostName } equals new { r.DriverInstanceId, r.HostName } into resilJoin
from r in resilJoin.DefaultIfEmpty()
orderby s.NodeId, s.DriverInstanceId, s.HostName
select new HostStatusRow(
s.NodeId,
@@ -54,10 +70,21 @@ public sealed class HostStatusService(OtOpcUaConfigDbContext db)
s.State,
s.StateChangedUtc,
s.LastSeenUtc,
s.Detail)).ToListAsync(ct);
s.Detail,
r != null ? r.ConsecutiveFailures : 0,
r != null ? r.LastCircuitBreakerOpenUtc : null,
r != null ? r.CurrentBulkheadDepth : 0,
r != null ? r.LastRecycleUtc : null)).ToListAsync(ct);
return rows;
}
public static bool IsStale(HostStatusRow row) =>
DateTime.UtcNow - row.LastSeenUtc > StaleThreshold;
/// <summary>
/// Red-badge predicate — <c>true</c> when the host has accumulated enough consecutive
/// failures that an operator should take notice before the breaker trips.
/// </summary>
public static bool IsFlagged(HostStatusRow row) =>
row.ConsecutiveFailures >= FailureFlagThreshold;
}