Compare commits
2 Commits
abcip-udt-
...
admin-host
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d06cc01a48 | ||
| 5536e96b46 |
@@ -56,6 +56,16 @@ else
|
||||
</div></div></div>
|
||||
</div>
|
||||
|
||||
@if (_rows.Any(HostStatusService.IsFlagged))
|
||||
{
|
||||
var flaggedCount = _rows.Count(HostStatusService.IsFlagged);
|
||||
<div class="alert alert-danger small mb-3">
|
||||
<strong>@flaggedCount host@(flaggedCount == 1 ? "" : "s")</strong>
|
||||
reporting ≥ @HostStatusService.FailureFlagThreshold consecutive failures — circuit breaker
|
||||
may trip soon. Inspect the resilience columns below to locate.
|
||||
</div>
|
||||
}
|
||||
|
||||
@foreach (var cluster in _rows.GroupBy(r => r.ClusterId ?? "(unassigned)").OrderBy(g => g.Key))
|
||||
{
|
||||
<h2 class="h5 mt-4">Cluster: <code>@cluster.Key</code></h2>
|
||||
@@ -66,6 +76,9 @@ else
|
||||
<th>Driver</th>
|
||||
<th>Host</th>
|
||||
<th>State</th>
|
||||
<th class="text-end" title="Consecutive failures — resets when a call succeeds or the breaker closes">Fail#</th>
|
||||
<th class="text-end" title="In-flight capability calls (bulkhead-depth proxy)">In-flight</th>
|
||||
<th>Breaker opened</th>
|
||||
<th>Last transition</th>
|
||||
<th>Last seen</th>
|
||||
<th>Detail</th>
|
||||
@@ -84,10 +97,21 @@ else
|
||||
{
|
||||
<span class="badge bg-warning text-dark ms-1">Stale</span>
|
||||
}
|
||||
@if (HostStatusService.IsFlagged(r))
|
||||
{
|
||||
<span class="badge bg-danger ms-1" title="≥ @HostStatusService.FailureFlagThreshold consecutive failures">Flagged</span>
|
||||
}
|
||||
</td>
|
||||
<td class="text-end small @(HostStatusService.IsFlagged(r) ? "text-danger fw-bold" : "")">
|
||||
@r.ConsecutiveFailures
|
||||
</td>
|
||||
<td class="text-end small">@r.CurrentBulkheadDepth</td>
|
||||
<td class="small">
|
||||
@(r.LastCircuitBreakerOpenUtc is null ? "—" : FormatAge(r.LastCircuitBreakerOpenUtc.Value))
|
||||
</td>
|
||||
<td class="small">@FormatAge(r.StateChangedUtc)</td>
|
||||
<td class="small @(HostStatusService.IsStale(r) ? "text-warning" : "")">@FormatAge(r.LastSeenUtc)</td>
|
||||
<td class="text-truncate small" style="max-width: 320px;" title="@r.Detail">@r.Detail</td>
|
||||
<td class="text-truncate small" style="max-width: 240px;" title="@r.Detail">@r.Detail</td>
|
||||
</tr>
|
||||
}
|
||||
</tbody>
|
||||
|
||||
@@ -7,8 +7,9 @@ namespace ZB.MOM.WW.OtOpcUa.Admin.Services;
|
||||
|
||||
/// <summary>
|
||||
/// One row per <see cref="DriverHostStatus"/> record, enriched with the owning
|
||||
/// <c>ClusterNode.ClusterId</c> when available (left-join). The Admin <c>/hosts</c> page
|
||||
/// groups by cluster and renders a per-node → per-driver → per-host tree.
|
||||
/// <c>ClusterNode.ClusterId</c> (left-join) + the per-<c>(DriverInstanceId, HostName)</c>
|
||||
/// <see cref="DriverInstanceResilienceStatus"/> counters (also left-join) so the Admin
|
||||
/// <c>/hosts</c> page renders the resilience surface inline with host state.
|
||||
/// </summary>
|
||||
public sealed record HostStatusRow(
|
||||
string NodeId,
|
||||
@@ -18,7 +19,11 @@ public sealed record HostStatusRow(
|
||||
DriverHostState State,
|
||||
DateTime StateChangedUtc,
|
||||
DateTime LastSeenUtc,
|
||||
string? Detail);
|
||||
string? Detail,
|
||||
int ConsecutiveFailures,
|
||||
DateTime? LastCircuitBreakerOpenUtc,
|
||||
int CurrentBulkheadDepth,
|
||||
DateTime? LastRecycleUtc);
|
||||
|
||||
/// <summary>
|
||||
/// Read-side service for the Admin UI's per-host drill-down. Loads
|
||||
@@ -36,15 +41,26 @@ public sealed class HostStatusService(OtOpcUaConfigDbContext db)
|
||||
{
|
||||
public static readonly TimeSpan StaleThreshold = TimeSpan.FromSeconds(30);
|
||||
|
||||
/// <summary>Consecutive-failure threshold at which <see cref="IsFlagged"/> returns <c>true</c>
|
||||
/// so the Admin UI can paint a red badge. Matches Phase 6.1 decision #143's conservative
|
||||
/// half-of-breaker-threshold convention — flags before the breaker actually opens.</summary>
|
||||
public const int FailureFlagThreshold = 3;
|
||||
|
||||
public async Task<IReadOnlyList<HostStatusRow>> ListAsync(CancellationToken ct = default)
|
||||
{
|
||||
// LEFT JOIN on NodeId so a row persists even when its owning ClusterNode row hasn't
|
||||
// been created yet (first-boot bootstrap case — keeps the UI from losing sight of
|
||||
// the reporting server).
|
||||
// Two LEFT JOINs:
|
||||
// 1. ClusterNodes on NodeId — row persists even when its owning ClusterNode row
|
||||
// hasn't been created yet (first-boot bootstrap case).
|
||||
// 2. DriverInstanceResilienceStatuses on (DriverInstanceId, HostName) — resilience
|
||||
// counters haven't been sampled yet for brand-new hosts, so a missing row means
|
||||
// zero failures + never-opened breaker.
|
||||
var rows = await (from s in db.DriverHostStatuses.AsNoTracking()
|
||||
join n in db.ClusterNodes.AsNoTracking()
|
||||
on s.NodeId equals n.NodeId into nodeJoin
|
||||
from n in nodeJoin.DefaultIfEmpty()
|
||||
join r in db.DriverInstanceResilienceStatuses.AsNoTracking()
|
||||
on new { s.DriverInstanceId, s.HostName } equals new { r.DriverInstanceId, r.HostName } into resilJoin
|
||||
from r in resilJoin.DefaultIfEmpty()
|
||||
orderby s.NodeId, s.DriverInstanceId, s.HostName
|
||||
select new HostStatusRow(
|
||||
s.NodeId,
|
||||
@@ -54,10 +70,21 @@ public sealed class HostStatusService(OtOpcUaConfigDbContext db)
|
||||
s.State,
|
||||
s.StateChangedUtc,
|
||||
s.LastSeenUtc,
|
||||
s.Detail)).ToListAsync(ct);
|
||||
s.Detail,
|
||||
r != null ? r.ConsecutiveFailures : 0,
|
||||
r != null ? r.LastCircuitBreakerOpenUtc : null,
|
||||
r != null ? r.CurrentBulkheadDepth : 0,
|
||||
r != null ? r.LastRecycleUtc : null)).ToListAsync(ct);
|
||||
return rows;
|
||||
}
|
||||
|
||||
public static bool IsStale(HostStatusRow row) =>
|
||||
DateTime.UtcNow - row.LastSeenUtc > StaleThreshold;
|
||||
|
||||
/// <summary>
|
||||
/// Red-badge predicate — <c>true</c> when the host has accumulated enough consecutive
|
||||
/// failures that an operator should take notice before the breaker trips.
|
||||
/// </summary>
|
||||
public static bool IsFlagged(HostStatusRow row) =>
|
||||
row.ConsecutiveFailures >= FailureFlagThreshold;
|
||||
}
|
||||
|
||||
@@ -24,11 +24,21 @@ public sealed class DriverResiliencePipelineBuilder
|
||||
{
|
||||
private readonly ConcurrentDictionary<PipelineKey, ResiliencePipeline> _pipelines = new();
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly DriverResilienceStatusTracker? _statusTracker;
|
||||
|
||||
/// <summary>Construct with the ambient clock (use <see cref="TimeProvider.System"/> in prod).</summary>
|
||||
public DriverResiliencePipelineBuilder(TimeProvider? timeProvider = null)
|
||||
/// <param name="timeProvider">Clock source for pipeline timeouts + breaker sampling. Defaults to system.</param>
|
||||
/// <param name="statusTracker">When non-null, every built pipeline wires Polly telemetry into
|
||||
/// the tracker — retries increment <c>ConsecutiveFailures</c>, breaker-open stamps
|
||||
/// <c>LastBreakerOpenUtc</c>, breaker-close resets failures. Feeds Admin <c>/hosts</c> +
|
||||
/// the Polly bulkhead-depth column. Absent tracker means no telemetry (unit tests +
|
||||
/// deployments that don't care about resilience observability).</param>
|
||||
public DriverResiliencePipelineBuilder(
|
||||
TimeProvider? timeProvider = null,
|
||||
DriverResilienceStatusTracker? statusTracker = null)
|
||||
{
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
_statusTracker = statusTracker;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -54,8 +64,9 @@ public sealed class DriverResiliencePipelineBuilder
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(hostName);
|
||||
|
||||
var key = new PipelineKey(driverInstanceId, hostName, capability);
|
||||
return _pipelines.GetOrAdd(key, static (_, state) => Build(state.capability, state.options, state.timeProvider),
|
||||
(capability, options, timeProvider: _timeProvider));
|
||||
return _pipelines.GetOrAdd(key, static (k, state) => Build(
|
||||
k.DriverInstanceId, k.HostName, state.capability, state.options, state.timeProvider, state.tracker),
|
||||
(capability, options, timeProvider: _timeProvider, tracker: _statusTracker));
|
||||
}
|
||||
|
||||
/// <summary>Drop cached pipelines for one driver instance (e.g. on ResilienceConfig change). Test + Admin-reload use.</summary>
|
||||
@@ -74,9 +85,12 @@ public sealed class DriverResiliencePipelineBuilder
|
||||
public int CachedPipelineCount => _pipelines.Count;
|
||||
|
||||
private static ResiliencePipeline Build(
|
||||
string driverInstanceId,
|
||||
string hostName,
|
||||
DriverCapability capability,
|
||||
DriverResilienceOptions options,
|
||||
TimeProvider timeProvider)
|
||||
TimeProvider timeProvider,
|
||||
DriverResilienceStatusTracker? tracker)
|
||||
{
|
||||
var policy = options.Resolve(capability);
|
||||
var builder = new ResiliencePipelineBuilder { TimeProvider = timeProvider };
|
||||
@@ -88,7 +102,7 @@ public sealed class DriverResiliencePipelineBuilder
|
||||
|
||||
if (policy.RetryCount > 0)
|
||||
{
|
||||
builder.AddRetry(new RetryStrategyOptions
|
||||
var retryOptions = new RetryStrategyOptions
|
||||
{
|
||||
MaxRetryAttempts = policy.RetryCount,
|
||||
BackoffType = DelayBackoffType.Exponential,
|
||||
@@ -96,19 +110,44 @@ public sealed class DriverResiliencePipelineBuilder
|
||||
Delay = TimeSpan.FromMilliseconds(100),
|
||||
MaxDelay = TimeSpan.FromSeconds(5),
|
||||
ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
|
||||
});
|
||||
};
|
||||
if (tracker is not null)
|
||||
{
|
||||
retryOptions.OnRetry = args =>
|
||||
{
|
||||
tracker.RecordFailure(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
|
||||
return default;
|
||||
};
|
||||
}
|
||||
builder.AddRetry(retryOptions);
|
||||
}
|
||||
|
||||
if (policy.BreakerFailureThreshold > 0)
|
||||
{
|
||||
builder.AddCircuitBreaker(new CircuitBreakerStrategyOptions
|
||||
var breakerOptions = new CircuitBreakerStrategyOptions
|
||||
{
|
||||
FailureRatio = 1.0,
|
||||
MinimumThroughput = policy.BreakerFailureThreshold,
|
||||
SamplingDuration = TimeSpan.FromSeconds(30),
|
||||
BreakDuration = TimeSpan.FromSeconds(15),
|
||||
ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
|
||||
});
|
||||
};
|
||||
if (tracker is not null)
|
||||
{
|
||||
breakerOptions.OnOpened = args =>
|
||||
{
|
||||
tracker.RecordBreakerOpen(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
|
||||
return default;
|
||||
};
|
||||
breakerOptions.OnClosed = args =>
|
||||
{
|
||||
// Closing the breaker means the target recovered — reset the consecutive-
|
||||
// failure counter so Admin UI stops flashing red for this host.
|
||||
tracker.RecordSuccess(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
|
||||
return default;
|
||||
};
|
||||
}
|
||||
builder.AddCircuitBreaker(breakerOptions);
|
||||
}
|
||||
|
||||
return builder.Build();
|
||||
|
||||
@@ -219,4 +219,67 @@ public sealed class DriverResiliencePipelineBuilderTests
|
||||
|
||||
attempts.ShouldBeLessThanOrEqualTo(1);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Tracker_RecordsFailure_OnEveryRetry()
|
||||
{
|
||||
var tracker = new DriverResilienceStatusTracker();
|
||||
var builder = new DriverResiliencePipelineBuilder(statusTracker: tracker);
|
||||
var pipeline = builder.GetOrCreate("drv-trk", "host-x", DriverCapability.Read, TierAOptions);
|
||||
|
||||
await Should.ThrowAsync<InvalidOperationException>(async () =>
|
||||
await pipeline.ExecuteAsync(async _ =>
|
||||
{
|
||||
await Task.Yield();
|
||||
throw new InvalidOperationException("always fails");
|
||||
}));
|
||||
|
||||
var snap = tracker.TryGet("drv-trk", "host-x");
|
||||
snap.ShouldNotBeNull();
|
||||
var retryCount = TierAOptions.Resolve(DriverCapability.Read).RetryCount;
|
||||
snap!.ConsecutiveFailures.ShouldBe(retryCount);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Tracker_StampsBreakerOpen_WhenBreakerTrips()
|
||||
{
|
||||
var tracker = new DriverResilienceStatusTracker();
|
||||
var builder = new DriverResiliencePipelineBuilder(statusTracker: tracker);
|
||||
var pipeline = builder.GetOrCreate("drv-trk", "host-b", DriverCapability.Write, TierAOptions);
|
||||
|
||||
var threshold = TierAOptions.Resolve(DriverCapability.Write).BreakerFailureThreshold;
|
||||
for (var i = 0; i < threshold; i++)
|
||||
{
|
||||
await Should.ThrowAsync<InvalidOperationException>(async () =>
|
||||
await pipeline.ExecuteAsync(async _ =>
|
||||
{
|
||||
await Task.Yield();
|
||||
throw new InvalidOperationException("boom");
|
||||
}));
|
||||
}
|
||||
|
||||
var snap = tracker.TryGet("drv-trk", "host-b");
|
||||
snap.ShouldNotBeNull();
|
||||
snap!.LastBreakerOpenUtc.ShouldNotBeNull();
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Tracker_IsolatesCounters_PerHost()
|
||||
{
|
||||
var tracker = new DriverResilienceStatusTracker();
|
||||
var builder = new DriverResiliencePipelineBuilder(statusTracker: tracker);
|
||||
var dead = builder.GetOrCreate("drv-trk", "dead", DriverCapability.Read, TierAOptions);
|
||||
var live = builder.GetOrCreate("drv-trk", "live", DriverCapability.Read, TierAOptions);
|
||||
|
||||
await Should.ThrowAsync<InvalidOperationException>(async () =>
|
||||
await dead.ExecuteAsync(async _ =>
|
||||
{
|
||||
await Task.Yield();
|
||||
throw new InvalidOperationException("dead");
|
||||
}));
|
||||
await live.ExecuteAsync(async _ => await Task.Yield());
|
||||
|
||||
tracker.TryGet("drv-trk", "dead")!.ConsecutiveFailures.ShouldBeGreaterThan(0);
|
||||
tracker.TryGet("drv-trk", "live").ShouldBeNull();
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user