diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Hosts.razor b/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Hosts.razor index d6a3d0a..c916834 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Hosts.razor +++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Hosts.razor @@ -56,6 +56,16 @@ else + @if (_rows.Any(HostStatusService.IsFlagged)) + { + var flaggedCount = _rows.Count(HostStatusService.IsFlagged); +
+ @flaggedCount host@(flaggedCount == 1 ? "" : "s") + reporting ≥ @HostStatusService.FailureFlagThreshold consecutive failures — circuit breaker + may trip soon. Inspect the resilience columns below to locate. +
+ } + @foreach (var cluster in _rows.GroupBy(r => r.ClusterId ?? "(unassigned)").OrderBy(g => g.Key)) {

Cluster: @cluster.Key

@@ -66,6 +76,9 @@ else Driver Host State + Fail# + In-flight + Breaker opened Last transition Last seen Detail @@ -84,10 +97,21 @@ else { Stale } + @if (HostStatusService.IsFlagged(r)) + { + Flagged + } + + + @r.ConsecutiveFailures + + @r.CurrentBulkheadDepth + + @(r.LastCircuitBreakerOpenUtc is null ? "—" : FormatAge(r.LastCircuitBreakerOpenUtc.Value)) @FormatAge(r.StateChangedUtc) @FormatAge(r.LastSeenUtc) - @r.Detail + @r.Detail } diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Services/HostStatusService.cs b/src/ZB.MOM.WW.OtOpcUa.Admin/Services/HostStatusService.cs index fcd8ea3..f0a7881 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Admin/Services/HostStatusService.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Services/HostStatusService.cs @@ -7,8 +7,9 @@ namespace ZB.MOM.WW.OtOpcUa.Admin.Services; /// /// One row per record, enriched with the owning -/// ClusterNode.ClusterId when available (left-join). The Admin /hosts page -/// groups by cluster and renders a per-node → per-driver → per-host tree. +/// ClusterNode.ClusterId (left-join) + the per-(DriverInstanceId, HostName) +/// counters (also left-join) so the Admin +/// /hosts page renders the resilience surface inline with host state. /// public sealed record HostStatusRow( string NodeId, @@ -18,7 +19,11 @@ public sealed record HostStatusRow( DriverHostState State, DateTime StateChangedUtc, DateTime LastSeenUtc, - string? Detail); + string? Detail, + int ConsecutiveFailures, + DateTime? LastCircuitBreakerOpenUtc, + int CurrentBulkheadDepth, + DateTime? LastRecycleUtc); /// /// Read-side service for the Admin UI's per-host drill-down. Loads @@ -36,15 +41,26 @@ public sealed class HostStatusService(OtOpcUaConfigDbContext db) { public static readonly TimeSpan StaleThreshold = TimeSpan.FromSeconds(30); + /// Consecutive-failure threshold at which returns true + /// so the Admin UI can paint a red badge. Matches Phase 6.1 decision #143's conservative + /// half-of-breaker-threshold convention — flags before the breaker actually opens. + public const int FailureFlagThreshold = 3; + public async Task> ListAsync(CancellationToken ct = default) { - // LEFT JOIN on NodeId so a row persists even when its owning ClusterNode row hasn't - // been created yet (first-boot bootstrap case — keeps the UI from losing sight of - // the reporting server). + // Two LEFT JOINs: + // 1. ClusterNodes on NodeId — row persists even when its owning ClusterNode row + // hasn't been created yet (first-boot bootstrap case). + // 2. DriverInstanceResilienceStatuses on (DriverInstanceId, HostName) — resilience + // counters haven't been sampled yet for brand-new hosts, so a missing row means + // zero failures + never-opened breaker. var rows = await (from s in db.DriverHostStatuses.AsNoTracking() join n in db.ClusterNodes.AsNoTracking() on s.NodeId equals n.NodeId into nodeJoin from n in nodeJoin.DefaultIfEmpty() + join r in db.DriverInstanceResilienceStatuses.AsNoTracking() + on new { s.DriverInstanceId, s.HostName } equals new { r.DriverInstanceId, r.HostName } into resilJoin + from r in resilJoin.DefaultIfEmpty() orderby s.NodeId, s.DriverInstanceId, s.HostName select new HostStatusRow( s.NodeId, @@ -54,10 +70,21 @@ public sealed class HostStatusService(OtOpcUaConfigDbContext db) s.State, s.StateChangedUtc, s.LastSeenUtc, - s.Detail)).ToListAsync(ct); + s.Detail, + r != null ? r.ConsecutiveFailures : 0, + r != null ? r.LastCircuitBreakerOpenUtc : null, + r != null ? r.CurrentBulkheadDepth : 0, + r != null ? r.LastRecycleUtc : null)).ToListAsync(ct); return rows; } public static bool IsStale(HostStatusRow row) => DateTime.UtcNow - row.LastSeenUtc > StaleThreshold; + + /// + /// Red-badge predicate — true when the host has accumulated enough consecutive + /// failures that an operator should take notice before the breaker trips. + /// + public static bool IsFlagged(HostStatusRow row) => + row.ConsecutiveFailures >= FailureFlagThreshold; } diff --git a/src/ZB.MOM.WW.OtOpcUa.Core/Resilience/DriverResiliencePipelineBuilder.cs b/src/ZB.MOM.WW.OtOpcUa.Core/Resilience/DriverResiliencePipelineBuilder.cs index d7e25af..c1b095e 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Core/Resilience/DriverResiliencePipelineBuilder.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Core/Resilience/DriverResiliencePipelineBuilder.cs @@ -24,11 +24,21 @@ public sealed class DriverResiliencePipelineBuilder { private readonly ConcurrentDictionary _pipelines = new(); private readonly TimeProvider _timeProvider; + private readonly DriverResilienceStatusTracker? _statusTracker; /// Construct with the ambient clock (use in prod). - public DriverResiliencePipelineBuilder(TimeProvider? timeProvider = null) + /// Clock source for pipeline timeouts + breaker sampling. Defaults to system. + /// When non-null, every built pipeline wires Polly telemetry into + /// the tracker — retries increment ConsecutiveFailures, breaker-open stamps + /// LastBreakerOpenUtc, breaker-close resets failures. Feeds Admin /hosts + + /// the Polly bulkhead-depth column. Absent tracker means no telemetry (unit tests + + /// deployments that don't care about resilience observability). + public DriverResiliencePipelineBuilder( + TimeProvider? timeProvider = null, + DriverResilienceStatusTracker? statusTracker = null) { _timeProvider = timeProvider ?? TimeProvider.System; + _statusTracker = statusTracker; } /// @@ -54,8 +64,9 @@ public sealed class DriverResiliencePipelineBuilder ArgumentException.ThrowIfNullOrWhiteSpace(hostName); var key = new PipelineKey(driverInstanceId, hostName, capability); - return _pipelines.GetOrAdd(key, static (_, state) => Build(state.capability, state.options, state.timeProvider), - (capability, options, timeProvider: _timeProvider)); + return _pipelines.GetOrAdd(key, static (k, state) => Build( + k.DriverInstanceId, k.HostName, state.capability, state.options, state.timeProvider, state.tracker), + (capability, options, timeProvider: _timeProvider, tracker: _statusTracker)); } /// Drop cached pipelines for one driver instance (e.g. on ResilienceConfig change). Test + Admin-reload use. @@ -74,9 +85,12 @@ public sealed class DriverResiliencePipelineBuilder public int CachedPipelineCount => _pipelines.Count; private static ResiliencePipeline Build( + string driverInstanceId, + string hostName, DriverCapability capability, DriverResilienceOptions options, - TimeProvider timeProvider) + TimeProvider timeProvider, + DriverResilienceStatusTracker? tracker) { var policy = options.Resolve(capability); var builder = new ResiliencePipelineBuilder { TimeProvider = timeProvider }; @@ -88,7 +102,7 @@ public sealed class DriverResiliencePipelineBuilder if (policy.RetryCount > 0) { - builder.AddRetry(new RetryStrategyOptions + var retryOptions = new RetryStrategyOptions { MaxRetryAttempts = policy.RetryCount, BackoffType = DelayBackoffType.Exponential, @@ -96,19 +110,44 @@ public sealed class DriverResiliencePipelineBuilder Delay = TimeSpan.FromMilliseconds(100), MaxDelay = TimeSpan.FromSeconds(5), ShouldHandle = new PredicateBuilder().Handle(ex => ex is not OperationCanceledException), - }); + }; + if (tracker is not null) + { + retryOptions.OnRetry = args => + { + tracker.RecordFailure(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime); + return default; + }; + } + builder.AddRetry(retryOptions); } if (policy.BreakerFailureThreshold > 0) { - builder.AddCircuitBreaker(new CircuitBreakerStrategyOptions + var breakerOptions = new CircuitBreakerStrategyOptions { FailureRatio = 1.0, MinimumThroughput = policy.BreakerFailureThreshold, SamplingDuration = TimeSpan.FromSeconds(30), BreakDuration = TimeSpan.FromSeconds(15), ShouldHandle = new PredicateBuilder().Handle(ex => ex is not OperationCanceledException), - }); + }; + if (tracker is not null) + { + breakerOptions.OnOpened = args => + { + tracker.RecordBreakerOpen(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime); + return default; + }; + breakerOptions.OnClosed = args => + { + // Closing the breaker means the target recovered — reset the consecutive- + // failure counter so Admin UI stops flashing red for this host. + tracker.RecordSuccess(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime); + return default; + }; + } + builder.AddCircuitBreaker(breakerOptions); } return builder.Build(); diff --git a/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Resilience/DriverResiliencePipelineBuilderTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Resilience/DriverResiliencePipelineBuilderTests.cs index 1167c5b..1d31808 100644 --- a/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Resilience/DriverResiliencePipelineBuilderTests.cs +++ b/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Resilience/DriverResiliencePipelineBuilderTests.cs @@ -219,4 +219,67 @@ public sealed class DriverResiliencePipelineBuilderTests attempts.ShouldBeLessThanOrEqualTo(1); } + + [Fact] + public async Task Tracker_RecordsFailure_OnEveryRetry() + { + var tracker = new DriverResilienceStatusTracker(); + var builder = new DriverResiliencePipelineBuilder(statusTracker: tracker); + var pipeline = builder.GetOrCreate("drv-trk", "host-x", DriverCapability.Read, TierAOptions); + + await Should.ThrowAsync(async () => + await pipeline.ExecuteAsync(async _ => + { + await Task.Yield(); + throw new InvalidOperationException("always fails"); + })); + + var snap = tracker.TryGet("drv-trk", "host-x"); + snap.ShouldNotBeNull(); + var retryCount = TierAOptions.Resolve(DriverCapability.Read).RetryCount; + snap!.ConsecutiveFailures.ShouldBe(retryCount); + } + + [Fact] + public async Task Tracker_StampsBreakerOpen_WhenBreakerTrips() + { + var tracker = new DriverResilienceStatusTracker(); + var builder = new DriverResiliencePipelineBuilder(statusTracker: tracker); + var pipeline = builder.GetOrCreate("drv-trk", "host-b", DriverCapability.Write, TierAOptions); + + var threshold = TierAOptions.Resolve(DriverCapability.Write).BreakerFailureThreshold; + for (var i = 0; i < threshold; i++) + { + await Should.ThrowAsync(async () => + await pipeline.ExecuteAsync(async _ => + { + await Task.Yield(); + throw new InvalidOperationException("boom"); + })); + } + + var snap = tracker.TryGet("drv-trk", "host-b"); + snap.ShouldNotBeNull(); + snap!.LastBreakerOpenUtc.ShouldNotBeNull(); + } + + [Fact] + public async Task Tracker_IsolatesCounters_PerHost() + { + var tracker = new DriverResilienceStatusTracker(); + var builder = new DriverResiliencePipelineBuilder(statusTracker: tracker); + var dead = builder.GetOrCreate("drv-trk", "dead", DriverCapability.Read, TierAOptions); + var live = builder.GetOrCreate("drv-trk", "live", DriverCapability.Read, TierAOptions); + + await Should.ThrowAsync(async () => + await dead.ExecuteAsync(async _ => + { + await Task.Yield(); + throw new InvalidOperationException("dead"); + })); + await live.ExecuteAsync(async _ => await Task.Yield()); + + tracker.TryGet("drv-trk", "dead")!.ConsecutiveFailures.ShouldBeGreaterThan(0); + tracker.TryGet("drv-trk", "live").ShouldBeNull(); + } }