diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Hosts.razor b/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Hosts.razor
index d6a3d0a..c916834 100644
--- a/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Hosts.razor
+++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Hosts.razor
@@ -56,6 +56,16 @@ else
+ @if (_rows.Any(HostStatusService.IsFlagged))
+ {
+ var flaggedCount = _rows.Count(HostStatusService.IsFlagged);
+
+ @flaggedCount host@(flaggedCount == 1 ? "" : "s")
+ reporting ≥ @HostStatusService.FailureFlagThreshold consecutive failures — circuit breaker
+ may trip soon. Inspect the resilience columns below to locate.
+
+ }
+
@foreach (var cluster in _rows.GroupBy(r => r.ClusterId ?? "(unassigned)").OrderBy(g => g.Key))
{
Cluster: @cluster.Key
@@ -66,6 +76,9 @@ else
Driver |
Host |
State |
+ Fail# |
+ In-flight |
+ Breaker opened |
Last transition |
Last seen |
Detail |
@@ -84,10 +97,21 @@ else
{
Stale
}
+ @if (HostStatusService.IsFlagged(r))
+ {
+ Flagged
+ }
+
+
+ @r.ConsecutiveFailures
+ |
+ @r.CurrentBulkheadDepth |
+
+ @(r.LastCircuitBreakerOpenUtc is null ? "—" : FormatAge(r.LastCircuitBreakerOpenUtc.Value))
|
@FormatAge(r.StateChangedUtc) |
@FormatAge(r.LastSeenUtc) |
- @r.Detail |
+ @r.Detail |
}
diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Services/HostStatusService.cs b/src/ZB.MOM.WW.OtOpcUa.Admin/Services/HostStatusService.cs
index fcd8ea3..f0a7881 100644
--- a/src/ZB.MOM.WW.OtOpcUa.Admin/Services/HostStatusService.cs
+++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Services/HostStatusService.cs
@@ -7,8 +7,9 @@ namespace ZB.MOM.WW.OtOpcUa.Admin.Services;
///
/// One row per record, enriched with the owning
-/// ClusterNode.ClusterId when available (left-join). The Admin /hosts page
-/// groups by cluster and renders a per-node → per-driver → per-host tree.
+/// ClusterNode.ClusterId (left-join) + the per-(DriverInstanceId, HostName)
+/// counters (also left-join) so the Admin
+/// /hosts page renders the resilience surface inline with host state.
///
public sealed record HostStatusRow(
string NodeId,
@@ -18,7 +19,11 @@ public sealed record HostStatusRow(
DriverHostState State,
DateTime StateChangedUtc,
DateTime LastSeenUtc,
- string? Detail);
+ string? Detail,
+ int ConsecutiveFailures,
+ DateTime? LastCircuitBreakerOpenUtc,
+ int CurrentBulkheadDepth,
+ DateTime? LastRecycleUtc);
///
/// Read-side service for the Admin UI's per-host drill-down. Loads
@@ -36,15 +41,26 @@ public sealed class HostStatusService(OtOpcUaConfigDbContext db)
{
public static readonly TimeSpan StaleThreshold = TimeSpan.FromSeconds(30);
+ /// Consecutive-failure threshold at which returns true
+ /// so the Admin UI can paint a red badge. Matches Phase 6.1 decision #143's conservative
+ /// half-of-breaker-threshold convention — flags before the breaker actually opens.
+ public const int FailureFlagThreshold = 3;
+
public async Task> ListAsync(CancellationToken ct = default)
{
- // LEFT JOIN on NodeId so a row persists even when its owning ClusterNode row hasn't
- // been created yet (first-boot bootstrap case — keeps the UI from losing sight of
- // the reporting server).
+ // Two LEFT JOINs:
+ // 1. ClusterNodes on NodeId — row persists even when its owning ClusterNode row
+ // hasn't been created yet (first-boot bootstrap case).
+ // 2. DriverInstanceResilienceStatuses on (DriverInstanceId, HostName) — resilience
+ // counters haven't been sampled yet for brand-new hosts, so a missing row means
+ // zero failures + never-opened breaker.
var rows = await (from s in db.DriverHostStatuses.AsNoTracking()
join n in db.ClusterNodes.AsNoTracking()
on s.NodeId equals n.NodeId into nodeJoin
from n in nodeJoin.DefaultIfEmpty()
+ join r in db.DriverInstanceResilienceStatuses.AsNoTracking()
+ on new { s.DriverInstanceId, s.HostName } equals new { r.DriverInstanceId, r.HostName } into resilJoin
+ from r in resilJoin.DefaultIfEmpty()
orderby s.NodeId, s.DriverInstanceId, s.HostName
select new HostStatusRow(
s.NodeId,
@@ -54,10 +70,21 @@ public sealed class HostStatusService(OtOpcUaConfigDbContext db)
s.State,
s.StateChangedUtc,
s.LastSeenUtc,
- s.Detail)).ToListAsync(ct);
+ s.Detail,
+ r != null ? r.ConsecutiveFailures : 0,
+ r != null ? r.LastCircuitBreakerOpenUtc : null,
+ r != null ? r.CurrentBulkheadDepth : 0,
+ r != null ? r.LastRecycleUtc : null)).ToListAsync(ct);
return rows;
}
public static bool IsStale(HostStatusRow row) =>
DateTime.UtcNow - row.LastSeenUtc > StaleThreshold;
+
+ ///
+ /// Red-badge predicate — true when the host has accumulated enough consecutive
+ /// failures that an operator should take notice before the breaker trips.
+ ///
+ public static bool IsFlagged(HostStatusRow row) =>
+ row.ConsecutiveFailures >= FailureFlagThreshold;
}
diff --git a/src/ZB.MOM.WW.OtOpcUa.Core/Resilience/DriverResiliencePipelineBuilder.cs b/src/ZB.MOM.WW.OtOpcUa.Core/Resilience/DriverResiliencePipelineBuilder.cs
index d7e25af..c1b095e 100644
--- a/src/ZB.MOM.WW.OtOpcUa.Core/Resilience/DriverResiliencePipelineBuilder.cs
+++ b/src/ZB.MOM.WW.OtOpcUa.Core/Resilience/DriverResiliencePipelineBuilder.cs
@@ -24,11 +24,21 @@ public sealed class DriverResiliencePipelineBuilder
{
private readonly ConcurrentDictionary _pipelines = new();
private readonly TimeProvider _timeProvider;
+ private readonly DriverResilienceStatusTracker? _statusTracker;
/// Construct with the ambient clock (use in prod).
- public DriverResiliencePipelineBuilder(TimeProvider? timeProvider = null)
+ /// Clock source for pipeline timeouts + breaker sampling. Defaults to system.
+ /// When non-null, every built pipeline wires Polly telemetry into
+ /// the tracker — retries increment ConsecutiveFailures, breaker-open stamps
+ /// LastBreakerOpenUtc, breaker-close resets failures. Feeds Admin /hosts +
+ /// the Polly bulkhead-depth column. Absent tracker means no telemetry (unit tests +
+ /// deployments that don't care about resilience observability).
+ public DriverResiliencePipelineBuilder(
+ TimeProvider? timeProvider = null,
+ DriverResilienceStatusTracker? statusTracker = null)
{
_timeProvider = timeProvider ?? TimeProvider.System;
+ _statusTracker = statusTracker;
}
///
@@ -54,8 +64,9 @@ public sealed class DriverResiliencePipelineBuilder
ArgumentException.ThrowIfNullOrWhiteSpace(hostName);
var key = new PipelineKey(driverInstanceId, hostName, capability);
- return _pipelines.GetOrAdd(key, static (_, state) => Build(state.capability, state.options, state.timeProvider),
- (capability, options, timeProvider: _timeProvider));
+ return _pipelines.GetOrAdd(key, static (k, state) => Build(
+ k.DriverInstanceId, k.HostName, state.capability, state.options, state.timeProvider, state.tracker),
+ (capability, options, timeProvider: _timeProvider, tracker: _statusTracker));
}
/// Drop cached pipelines for one driver instance (e.g. on ResilienceConfig change). Test + Admin-reload use.
@@ -74,9 +85,12 @@ public sealed class DriverResiliencePipelineBuilder
public int CachedPipelineCount => _pipelines.Count;
private static ResiliencePipeline Build(
+ string driverInstanceId,
+ string hostName,
DriverCapability capability,
DriverResilienceOptions options,
- TimeProvider timeProvider)
+ TimeProvider timeProvider,
+ DriverResilienceStatusTracker? tracker)
{
var policy = options.Resolve(capability);
var builder = new ResiliencePipelineBuilder { TimeProvider = timeProvider };
@@ -88,7 +102,7 @@ public sealed class DriverResiliencePipelineBuilder
if (policy.RetryCount > 0)
{
- builder.AddRetry(new RetryStrategyOptions
+ var retryOptions = new RetryStrategyOptions
{
MaxRetryAttempts = policy.RetryCount,
BackoffType = DelayBackoffType.Exponential,
@@ -96,19 +110,44 @@ public sealed class DriverResiliencePipelineBuilder
Delay = TimeSpan.FromMilliseconds(100),
MaxDelay = TimeSpan.FromSeconds(5),
ShouldHandle = new PredicateBuilder().Handle(ex => ex is not OperationCanceledException),
- });
+ };
+ if (tracker is not null)
+ {
+ retryOptions.OnRetry = args =>
+ {
+ tracker.RecordFailure(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
+ return default;
+ };
+ }
+ builder.AddRetry(retryOptions);
}
if (policy.BreakerFailureThreshold > 0)
{
- builder.AddCircuitBreaker(new CircuitBreakerStrategyOptions
+ var breakerOptions = new CircuitBreakerStrategyOptions
{
FailureRatio = 1.0,
MinimumThroughput = policy.BreakerFailureThreshold,
SamplingDuration = TimeSpan.FromSeconds(30),
BreakDuration = TimeSpan.FromSeconds(15),
ShouldHandle = new PredicateBuilder().Handle(ex => ex is not OperationCanceledException),
- });
+ };
+ if (tracker is not null)
+ {
+ breakerOptions.OnOpened = args =>
+ {
+ tracker.RecordBreakerOpen(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
+ return default;
+ };
+ breakerOptions.OnClosed = args =>
+ {
+ // Closing the breaker means the target recovered — reset the consecutive-
+ // failure counter so Admin UI stops flashing red for this host.
+ tracker.RecordSuccess(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
+ return default;
+ };
+ }
+ builder.AddCircuitBreaker(breakerOptions);
}
return builder.Build();
diff --git a/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Resilience/DriverResiliencePipelineBuilderTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Resilience/DriverResiliencePipelineBuilderTests.cs
index 1167c5b..1d31808 100644
--- a/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Resilience/DriverResiliencePipelineBuilderTests.cs
+++ b/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Resilience/DriverResiliencePipelineBuilderTests.cs
@@ -219,4 +219,67 @@ public sealed class DriverResiliencePipelineBuilderTests
attempts.ShouldBeLessThanOrEqualTo(1);
}
+
+ [Fact]
+ public async Task Tracker_RecordsFailure_OnEveryRetry()
+ {
+ var tracker = new DriverResilienceStatusTracker();
+ var builder = new DriverResiliencePipelineBuilder(statusTracker: tracker);
+ var pipeline = builder.GetOrCreate("drv-trk", "host-x", DriverCapability.Read, TierAOptions);
+
+ await Should.ThrowAsync(async () =>
+ await pipeline.ExecuteAsync(async _ =>
+ {
+ await Task.Yield();
+ throw new InvalidOperationException("always fails");
+ }));
+
+ var snap = tracker.TryGet("drv-trk", "host-x");
+ snap.ShouldNotBeNull();
+ var retryCount = TierAOptions.Resolve(DriverCapability.Read).RetryCount;
+ snap!.ConsecutiveFailures.ShouldBe(retryCount);
+ }
+
+ [Fact]
+ public async Task Tracker_StampsBreakerOpen_WhenBreakerTrips()
+ {
+ var tracker = new DriverResilienceStatusTracker();
+ var builder = new DriverResiliencePipelineBuilder(statusTracker: tracker);
+ var pipeline = builder.GetOrCreate("drv-trk", "host-b", DriverCapability.Write, TierAOptions);
+
+ var threshold = TierAOptions.Resolve(DriverCapability.Write).BreakerFailureThreshold;
+ for (var i = 0; i < threshold; i++)
+ {
+ await Should.ThrowAsync(async () =>
+ await pipeline.ExecuteAsync(async _ =>
+ {
+ await Task.Yield();
+ throw new InvalidOperationException("boom");
+ }));
+ }
+
+ var snap = tracker.TryGet("drv-trk", "host-b");
+ snap.ShouldNotBeNull();
+ snap!.LastBreakerOpenUtc.ShouldNotBeNull();
+ }
+
+ [Fact]
+ public async Task Tracker_IsolatesCounters_PerHost()
+ {
+ var tracker = new DriverResilienceStatusTracker();
+ var builder = new DriverResiliencePipelineBuilder(statusTracker: tracker);
+ var dead = builder.GetOrCreate("drv-trk", "dead", DriverCapability.Read, TierAOptions);
+ var live = builder.GetOrCreate("drv-trk", "live", DriverCapability.Read, TierAOptions);
+
+ await Should.ThrowAsync(async () =>
+ await dead.ExecuteAsync(async _ =>
+ {
+ await Task.Yield();
+ throw new InvalidOperationException("dead");
+ }));
+ await live.ExecuteAsync(async _ => await Task.Yield());
+
+ tracker.TryGet("drv-trk", "dead")!.ConsecutiveFailures.ShouldBeGreaterThan(0);
+ tracker.TryGet("drv-trk", "live").ShouldBeNull();
+ }
}