Admin /hosts red-badge + resilience columns + Polly telemetry observer (#164) #132

Merged
dohertj2 merged 1 commits from admin-hosts-refresh-red-badge into v2 2026-04-19 21:38:12 -04:00
4 changed files with 169 additions and 16 deletions

View File

@@ -56,6 +56,16 @@ else
</div></div></div>
</div>
@if (_rows.Any(HostStatusService.IsFlagged))
{
var flaggedCount = _rows.Count(HostStatusService.IsFlagged);
<div class="alert alert-danger small mb-3">
<strong>@flaggedCount host@(flaggedCount == 1 ? "" : "s")</strong>
reporting ≥ @HostStatusService.FailureFlagThreshold consecutive failures — circuit breaker
may trip soon. Inspect the resilience columns below to locate.
</div>
}
@foreach (var cluster in _rows.GroupBy(r => r.ClusterId ?? "(unassigned)").OrderBy(g => g.Key))
{
<h2 class="h5 mt-4">Cluster: <code>@cluster.Key</code></h2>
@@ -66,6 +76,9 @@ else
<th>Driver</th>
<th>Host</th>
<th>State</th>
<th class="text-end" title="Consecutive failures — resets when a call succeeds or the breaker closes">Fail#</th>
<th class="text-end" title="In-flight capability calls (bulkhead-depth proxy)">In-flight</th>
<th>Breaker opened</th>
<th>Last transition</th>
<th>Last seen</th>
<th>Detail</th>
@@ -84,10 +97,21 @@ else
{
<span class="badge bg-warning text-dark ms-1">Stale</span>
}
@if (HostStatusService.IsFlagged(r))
{
<span class="badge bg-danger ms-1" title="≥ @HostStatusService.FailureFlagThreshold consecutive failures">Flagged</span>
}
</td>
<td class="text-end small @(HostStatusService.IsFlagged(r) ? "text-danger fw-bold" : "")">
@r.ConsecutiveFailures
</td>
<td class="text-end small">@r.CurrentBulkheadDepth</td>
<td class="small">
@(r.LastCircuitBreakerOpenUtc is null ? "—" : FormatAge(r.LastCircuitBreakerOpenUtc.Value))
</td>
<td class="small">@FormatAge(r.StateChangedUtc)</td>
<td class="small @(HostStatusService.IsStale(r) ? "text-warning" : "")">@FormatAge(r.LastSeenUtc)</td>
<td class="text-truncate small" style="max-width: 320px;" title="@r.Detail">@r.Detail</td>
<td class="text-truncate small" style="max-width: 240px;" title="@r.Detail">@r.Detail</td>
</tr>
}
</tbody>

View File

@@ -7,8 +7,9 @@ namespace ZB.MOM.WW.OtOpcUa.Admin.Services;
/// <summary>
/// One row per <see cref="DriverHostStatus"/> record, enriched with the owning
/// <c>ClusterNode.ClusterId</c> when available (left-join). The Admin <c>/hosts</c> page
/// groups by cluster and renders a per-node → per-driver → per-host tree.
/// <c>ClusterNode.ClusterId</c> (left-join) + the per-<c>(DriverInstanceId, HostName)</c>
/// <see cref="DriverInstanceResilienceStatus"/> counters (also left-join) so the Admin
/// <c>/hosts</c> page renders the resilience surface inline with host state.
/// </summary>
public sealed record HostStatusRow(
string NodeId,
@@ -18,7 +19,11 @@ public sealed record HostStatusRow(
DriverHostState State,
DateTime StateChangedUtc,
DateTime LastSeenUtc,
string? Detail);
string? Detail,
int ConsecutiveFailures,
DateTime? LastCircuitBreakerOpenUtc,
int CurrentBulkheadDepth,
DateTime? LastRecycleUtc);
/// <summary>
/// Read-side service for the Admin UI's per-host drill-down. Loads
@@ -36,15 +41,26 @@ public sealed class HostStatusService(OtOpcUaConfigDbContext db)
{
public static readonly TimeSpan StaleThreshold = TimeSpan.FromSeconds(30);
/// <summary>Consecutive-failure threshold at which <see cref="IsFlagged"/> returns <c>true</c>
/// so the Admin UI can paint a red badge. Matches Phase 6.1 decision #143's conservative
/// half-of-breaker-threshold convention — flags before the breaker actually opens.</summary>
public const int FailureFlagThreshold = 3;
public async Task<IReadOnlyList<HostStatusRow>> ListAsync(CancellationToken ct = default)
{
// LEFT JOIN on NodeId so a row persists even when its owning ClusterNode row hasn't
// been created yet (first-boot bootstrap case — keeps the UI from losing sight of
// the reporting server).
// Two LEFT JOINs:
// 1. ClusterNodes on NodeId — row persists even when its owning ClusterNode row
// hasn't been created yet (first-boot bootstrap case).
// 2. DriverInstanceResilienceStatuses on (DriverInstanceId, HostName) — resilience
// counters haven't been sampled yet for brand-new hosts, so a missing row means
// zero failures + never-opened breaker.
var rows = await (from s in db.DriverHostStatuses.AsNoTracking()
join n in db.ClusterNodes.AsNoTracking()
on s.NodeId equals n.NodeId into nodeJoin
from n in nodeJoin.DefaultIfEmpty()
join r in db.DriverInstanceResilienceStatuses.AsNoTracking()
on new { s.DriverInstanceId, s.HostName } equals new { r.DriverInstanceId, r.HostName } into resilJoin
from r in resilJoin.DefaultIfEmpty()
orderby s.NodeId, s.DriverInstanceId, s.HostName
select new HostStatusRow(
s.NodeId,
@@ -54,10 +70,21 @@ public sealed class HostStatusService(OtOpcUaConfigDbContext db)
s.State,
s.StateChangedUtc,
s.LastSeenUtc,
s.Detail)).ToListAsync(ct);
s.Detail,
r != null ? r.ConsecutiveFailures : 0,
r != null ? r.LastCircuitBreakerOpenUtc : null,
r != null ? r.CurrentBulkheadDepth : 0,
r != null ? r.LastRecycleUtc : null)).ToListAsync(ct);
return rows;
}
public static bool IsStale(HostStatusRow row) =>
DateTime.UtcNow - row.LastSeenUtc > StaleThreshold;
/// <summary>
/// Red-badge predicate — <c>true</c> when the host has accumulated enough consecutive
/// failures that an operator should take notice before the breaker trips.
/// </summary>
public static bool IsFlagged(HostStatusRow row) =>
row.ConsecutiveFailures >= FailureFlagThreshold;
}

View File

@@ -24,11 +24,21 @@ public sealed class DriverResiliencePipelineBuilder
{
private readonly ConcurrentDictionary<PipelineKey, ResiliencePipeline> _pipelines = new();
private readonly TimeProvider _timeProvider;
private readonly DriverResilienceStatusTracker? _statusTracker;
/// <summary>Construct with the ambient clock (use <see cref="TimeProvider.System"/> in prod).</summary>
public DriverResiliencePipelineBuilder(TimeProvider? timeProvider = null)
/// <param name="timeProvider">Clock source for pipeline timeouts + breaker sampling. Defaults to system.</param>
/// <param name="statusTracker">When non-null, every built pipeline wires Polly telemetry into
/// the tracker — retries increment <c>ConsecutiveFailures</c>, breaker-open stamps
/// <c>LastBreakerOpenUtc</c>, breaker-close resets failures. Feeds Admin <c>/hosts</c> +
/// the Polly bulkhead-depth column. Absent tracker means no telemetry (unit tests +
/// deployments that don't care about resilience observability).</param>
public DriverResiliencePipelineBuilder(
TimeProvider? timeProvider = null,
DriverResilienceStatusTracker? statusTracker = null)
{
_timeProvider = timeProvider ?? TimeProvider.System;
_statusTracker = statusTracker;
}
/// <summary>
@@ -54,8 +64,9 @@ public sealed class DriverResiliencePipelineBuilder
ArgumentException.ThrowIfNullOrWhiteSpace(hostName);
var key = new PipelineKey(driverInstanceId, hostName, capability);
return _pipelines.GetOrAdd(key, static (_, state) => Build(state.capability, state.options, state.timeProvider),
(capability, options, timeProvider: _timeProvider));
return _pipelines.GetOrAdd(key, static (k, state) => Build(
k.DriverInstanceId, k.HostName, state.capability, state.options, state.timeProvider, state.tracker),
(capability, options, timeProvider: _timeProvider, tracker: _statusTracker));
}
/// <summary>Drop cached pipelines for one driver instance (e.g. on ResilienceConfig change). Test + Admin-reload use.</summary>
@@ -74,9 +85,12 @@ public sealed class DriverResiliencePipelineBuilder
public int CachedPipelineCount => _pipelines.Count;
private static ResiliencePipeline Build(
string driverInstanceId,
string hostName,
DriverCapability capability,
DriverResilienceOptions options,
TimeProvider timeProvider)
TimeProvider timeProvider,
DriverResilienceStatusTracker? tracker)
{
var policy = options.Resolve(capability);
var builder = new ResiliencePipelineBuilder { TimeProvider = timeProvider };
@@ -88,7 +102,7 @@ public sealed class DriverResiliencePipelineBuilder
if (policy.RetryCount > 0)
{
builder.AddRetry(new RetryStrategyOptions
var retryOptions = new RetryStrategyOptions
{
MaxRetryAttempts = policy.RetryCount,
BackoffType = DelayBackoffType.Exponential,
@@ -96,19 +110,44 @@ public sealed class DriverResiliencePipelineBuilder
Delay = TimeSpan.FromMilliseconds(100),
MaxDelay = TimeSpan.FromSeconds(5),
ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
});
};
if (tracker is not null)
{
retryOptions.OnRetry = args =>
{
tracker.RecordFailure(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
return default;
};
}
builder.AddRetry(retryOptions);
}
if (policy.BreakerFailureThreshold > 0)
{
builder.AddCircuitBreaker(new CircuitBreakerStrategyOptions
var breakerOptions = new CircuitBreakerStrategyOptions
{
FailureRatio = 1.0,
MinimumThroughput = policy.BreakerFailureThreshold,
SamplingDuration = TimeSpan.FromSeconds(30),
BreakDuration = TimeSpan.FromSeconds(15),
ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
});
};
if (tracker is not null)
{
breakerOptions.OnOpened = args =>
{
tracker.RecordBreakerOpen(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
return default;
};
breakerOptions.OnClosed = args =>
{
// Closing the breaker means the target recovered — reset the consecutive-
// failure counter so Admin UI stops flashing red for this host.
tracker.RecordSuccess(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
return default;
};
}
builder.AddCircuitBreaker(breakerOptions);
}
return builder.Build();

View File

@@ -219,4 +219,67 @@ public sealed class DriverResiliencePipelineBuilderTests
attempts.ShouldBeLessThanOrEqualTo(1);
}
[Fact]
public async Task Tracker_RecordsFailure_OnEveryRetry()
{
var tracker = new DriverResilienceStatusTracker();
var builder = new DriverResiliencePipelineBuilder(statusTracker: tracker);
var pipeline = builder.GetOrCreate("drv-trk", "host-x", DriverCapability.Read, TierAOptions);
await Should.ThrowAsync<InvalidOperationException>(async () =>
await pipeline.ExecuteAsync(async _ =>
{
await Task.Yield();
throw new InvalidOperationException("always fails");
}));
var snap = tracker.TryGet("drv-trk", "host-x");
snap.ShouldNotBeNull();
var retryCount = TierAOptions.Resolve(DriverCapability.Read).RetryCount;
snap!.ConsecutiveFailures.ShouldBe(retryCount);
}
[Fact]
public async Task Tracker_StampsBreakerOpen_WhenBreakerTrips()
{
var tracker = new DriverResilienceStatusTracker();
var builder = new DriverResiliencePipelineBuilder(statusTracker: tracker);
var pipeline = builder.GetOrCreate("drv-trk", "host-b", DriverCapability.Write, TierAOptions);
var threshold = TierAOptions.Resolve(DriverCapability.Write).BreakerFailureThreshold;
for (var i = 0; i < threshold; i++)
{
await Should.ThrowAsync<InvalidOperationException>(async () =>
await pipeline.ExecuteAsync(async _ =>
{
await Task.Yield();
throw new InvalidOperationException("boom");
}));
}
var snap = tracker.TryGet("drv-trk", "host-b");
snap.ShouldNotBeNull();
snap!.LastBreakerOpenUtc.ShouldNotBeNull();
}
[Fact]
public async Task Tracker_IsolatesCounters_PerHost()
{
var tracker = new DriverResilienceStatusTracker();
var builder = new DriverResiliencePipelineBuilder(statusTracker: tracker);
var dead = builder.GetOrCreate("drv-trk", "dead", DriverCapability.Read, TierAOptions);
var live = builder.GetOrCreate("drv-trk", "live", DriverCapability.Read, TierAOptions);
await Should.ThrowAsync<InvalidOperationException>(async () =>
await dead.ExecuteAsync(async _ =>
{
await Task.Yield();
throw new InvalidOperationException("dead");
}));
await live.ExecuteAsync(async _ => await Task.Yield());
tracker.TryGet("drv-trk", "dead")!.ConsecutiveFailures.ShouldBeGreaterThan(0);
tracker.TryGet("drv-trk", "live").ShouldBeNull();
}
}