From d06cc01a4836f836d507af680dc85a31c9dd5acb Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sun, 19 Apr 2026 21:35:54 -0400 Subject: [PATCH] =?UTF-8?q?Admin=20/hosts=20red-badge=20+=20resilience=20c?= =?UTF-8?q?olumns=20+=20Polly=20telemetry=20observer.=20Closes=20task=20#1?= =?UTF-8?q?64=20(the=20remaining=20slice=20of=20Phase=206.1=20Stream=20E.3?= =?UTF-8?q?=20after=20the=20earlier=20publisher=20+=20hub=20PR).=20Three?= =?UTF-8?q?=20cooperating=20pieces=20wired=20together=20so=20the=20operato?= =?UTF-8?q?r-facing=20/hosts=20table=20actually=20reflects=20the=20live=20?= =?UTF-8?q?Polly=20counters=20that=20the=20pipeline=20builder=20is=20produ?= =?UTF-8?q?cing.=20DriverResiliencePipelineBuilder=20gains=20an=20optional?= =?UTF-8?q?=20DriverResilienceStatusTracker=20ctor=20param=20=E2=80=94=20w?= =?UTF-8?q?hen=20non-null,=20every=20built=20pipeline=20wires=20Polly's=20?= =?UTF-8?q?OnRetry/OnOpened/OnClosed=20strategy-options=20callbacks=20into?= =?UTF-8?q?=20the=20tracker.=20OnRetry=20=E2=86=92=20tracker.RecordFailure?= =?UTF-8?q?=20(so=20ConsecutiveFailures=20climbs=20per=20retry),=20OnOpene?= =?UTF-8?q?d=20=E2=86=92=20tracker.RecordBreakerOpen=20(stamps=20LastCircu?= =?UTF-8?q?itBreakerOpenUtc),=20OnClosed=20=E2=86=92=20tracker.RecordSucce?= =?UTF-8?q?ss=20(resets=20the=20failure=20counter=20once=20the=20target=20?= =?UTF-8?q?recovers).=20Absent=20tracker=20=3D=20silent,=20preserving=20th?= =?UTF-8?q?e=20unit-test=20constructor=20path=20+=20any=20deployment=20tha?= =?UTF-8?q?t=20doesn't=20care=20about=20resilience=20observability.=20Canc?= =?UTF-8?q?ellation=20stays=20excluded=20from=20the=20failure=20count=20vi?= =?UTF-8?q?a=20the=20existing=20ShouldHandle=20predicate.=20HostStatusServ?= =?UTF-8?q?ice.HostStatusRow=20extends=20with=20four=20new=20fields=20?= =?UTF-8?q?=E2=80=94=20ConsecutiveFailures,=20LastCircuitBreakerOpenUtc,?= =?UTF-8?q?=20CurrentBulkheadDepth,=20LastRecycleUtc=20=E2=80=94=20populat?= =?UTF-8?q?ed=20via=20a=20second=20LEFT=20JOIN=20onto=20DriverInstanceResi?= =?UTF-8?q?lienceStatuses=20keyed=20on=20(DriverInstanceId,=20HostName).?= =?UTF-8?q?=20LEFT=20JOIN=20because=20brand-new=20hosts=20haven't=20been?= =?UTF-8?q?=20sampled=20yet;=20a=20missing=20row=20means=20zero=20failures?= =?UTF-8?q?=20+=20never-opened=20breaker,=20which=20is=20the=20correct=20d?= =?UTF-8?q?efault.=20New=20FailureFlagThreshold=20constant=20(=3D3,=20matc?= =?UTF-8?q?hes=20plan=20decision=20#143's=20conservative=20half-of-breaker?= =?UTF-8?q?=20convention)=20+=20IsFlagged=20predicate=20so=20the=20UI=20ca?= =?UTF-8?q?n=20pre-warn=20before=20the=20breaker=20actually=20trips.=20Hos?= =?UTF-8?q?ts.razor=20paints=20three=20new=20columns=20between=20State=20a?= =?UTF-8?q?nd=20Last-transition=20=E2=80=94=20Fail#=20(bold=20red=20when?= =?UTF-8?q?=20flagged),=20In-flight=20(bulkhead-depth=20proxy),=20Breaker-?= =?UTF-8?q?opened=20(relative=20age).=20Per-row=20"Flagged"=20red=20badge?= =?UTF-8?q?=20alongside=20State=20when=20IsFlagged=20is=20true.=20Above=20?= =?UTF-8?q?the=20first=20cluster=20table,=20a=20red=20alert=20banner=20sum?= =?UTF-8?q?marises=20the=20flagged-host=20count=20when=20=E2=89=A51=20host?= =?UTF-8?q?=20is=20flagged,=20so=20operators=20see=20the=20problem=20befor?= =?UTF-8?q?e=20scanning=20rows.=20Three=20new=20tests=20in=20DriverResilie?= =?UTF-8?q?ncePipelineBuilderTests=20=E2=80=94=20Tracker=5FRecordsFailure?= =?UTF-8?q?=5FOnEveryRetry=20verifies=20ConsecutiveFailures=20reaches=20Re?= =?UTF-8?q?tryCount=20after=20a=20transient-forever=20operation,=20Tracker?= =?UTF-8?q?=5FStampsBreakerOpen=5FWhenBreakerTrips=20verifies=20LastBreake?= =?UTF-8?q?rOpenUtc=20is=20set=20after=20threshold=20failures=20on=20a=20W?= =?UTF-8?q?rite=20pipeline,=20Tracker=5FIsolatesCounters=5FPerHost=20verif?= =?UTF-8?q?ies=20one=20dead=20host=20does=20not=20leak=20failure=20counts?= =?UTF-8?q?=20into=20a=20healthy=20sibling.=20Full=20suite=20=E2=80=94=20C?= =?UTF-8?q?ore.Tests=2014/14=20resilience-builder=20tests=20passing=20(11?= =?UTF-8?q?=20existing=20+=203=20new),=20Admin.Tests=2072/72=20passing,=20?= =?UTF-8?q?Admin=20project=20builds=200=20errors.=20SignalR=20live=20push?= =?UTF-8?q?=20of=20status=20changes=20+=20browser=20visual=20review=20are?= =?UTF-8?q?=20deliberately=20left=20to=20a=20follow-up=20=E2=80=94=20this?= =?UTF-8?q?=20PR=20keeps=20the=20structural=20change=20minimal=20(polling?= =?UTF-8?q?=20refresh=20already=20exists=20in=20the=20page's=2010s=20timer?= =?UTF-8?q?;=20SignalR=20would=20be=20a=20structural=20add=20that=20touche?= =?UTF-8?q?s=20hub=20registration=20+=20client=20subscription).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Components/Pages/Hosts.razor | 26 +++++++- .../Services/HostStatusService.cs | 41 +++++++++--- .../DriverResiliencePipelineBuilder.cs | 55 +++++++++++++--- .../DriverResiliencePipelineBuilderTests.cs | 63 +++++++++++++++++++ 4 files changed, 169 insertions(+), 16 deletions(-) diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Hosts.razor b/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Hosts.razor index d6a3d0a..c916834 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Hosts.razor +++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Hosts.razor @@ -56,6 +56,16 @@ else + @if (_rows.Any(HostStatusService.IsFlagged)) + { + var flaggedCount = _rows.Count(HostStatusService.IsFlagged); +
+ @flaggedCount host@(flaggedCount == 1 ? "" : "s") + reporting ≥ @HostStatusService.FailureFlagThreshold consecutive failures — circuit breaker + may trip soon. Inspect the resilience columns below to locate. +
+ } + @foreach (var cluster in _rows.GroupBy(r => r.ClusterId ?? "(unassigned)").OrderBy(g => g.Key)) {

Cluster: @cluster.Key

@@ -66,6 +76,9 @@ else Driver Host State + Fail# + In-flight + Breaker opened Last transition Last seen Detail @@ -84,10 +97,21 @@ else { Stale } + @if (HostStatusService.IsFlagged(r)) + { + Flagged + } + + + @r.ConsecutiveFailures + + @r.CurrentBulkheadDepth + + @(r.LastCircuitBreakerOpenUtc is null ? "—" : FormatAge(r.LastCircuitBreakerOpenUtc.Value)) @FormatAge(r.StateChangedUtc) @FormatAge(r.LastSeenUtc) - @r.Detail + @r.Detail } diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Services/HostStatusService.cs b/src/ZB.MOM.WW.OtOpcUa.Admin/Services/HostStatusService.cs index fcd8ea3..f0a7881 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Admin/Services/HostStatusService.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Services/HostStatusService.cs @@ -7,8 +7,9 @@ namespace ZB.MOM.WW.OtOpcUa.Admin.Services; /// /// One row per record, enriched with the owning -/// ClusterNode.ClusterId when available (left-join). The Admin /hosts page -/// groups by cluster and renders a per-node → per-driver → per-host tree. +/// ClusterNode.ClusterId (left-join) + the per-(DriverInstanceId, HostName) +/// counters (also left-join) so the Admin +/// /hosts page renders the resilience surface inline with host state. /// public sealed record HostStatusRow( string NodeId, @@ -18,7 +19,11 @@ public sealed record HostStatusRow( DriverHostState State, DateTime StateChangedUtc, DateTime LastSeenUtc, - string? Detail); + string? Detail, + int ConsecutiveFailures, + DateTime? LastCircuitBreakerOpenUtc, + int CurrentBulkheadDepth, + DateTime? LastRecycleUtc); /// /// Read-side service for the Admin UI's per-host drill-down. Loads @@ -36,15 +41,26 @@ public sealed class HostStatusService(OtOpcUaConfigDbContext db) { public static readonly TimeSpan StaleThreshold = TimeSpan.FromSeconds(30); + /// Consecutive-failure threshold at which returns true + /// so the Admin UI can paint a red badge. Matches Phase 6.1 decision #143's conservative + /// half-of-breaker-threshold convention — flags before the breaker actually opens. + public const int FailureFlagThreshold = 3; + public async Task> ListAsync(CancellationToken ct = default) { - // LEFT JOIN on NodeId so a row persists even when its owning ClusterNode row hasn't - // been created yet (first-boot bootstrap case — keeps the UI from losing sight of - // the reporting server). + // Two LEFT JOINs: + // 1. ClusterNodes on NodeId — row persists even when its owning ClusterNode row + // hasn't been created yet (first-boot bootstrap case). + // 2. DriverInstanceResilienceStatuses on (DriverInstanceId, HostName) — resilience + // counters haven't been sampled yet for brand-new hosts, so a missing row means + // zero failures + never-opened breaker. var rows = await (from s in db.DriverHostStatuses.AsNoTracking() join n in db.ClusterNodes.AsNoTracking() on s.NodeId equals n.NodeId into nodeJoin from n in nodeJoin.DefaultIfEmpty() + join r in db.DriverInstanceResilienceStatuses.AsNoTracking() + on new { s.DriverInstanceId, s.HostName } equals new { r.DriverInstanceId, r.HostName } into resilJoin + from r in resilJoin.DefaultIfEmpty() orderby s.NodeId, s.DriverInstanceId, s.HostName select new HostStatusRow( s.NodeId, @@ -54,10 +70,21 @@ public sealed class HostStatusService(OtOpcUaConfigDbContext db) s.State, s.StateChangedUtc, s.LastSeenUtc, - s.Detail)).ToListAsync(ct); + s.Detail, + r != null ? r.ConsecutiveFailures : 0, + r != null ? r.LastCircuitBreakerOpenUtc : null, + r != null ? r.CurrentBulkheadDepth : 0, + r != null ? r.LastRecycleUtc : null)).ToListAsync(ct); return rows; } public static bool IsStale(HostStatusRow row) => DateTime.UtcNow - row.LastSeenUtc > StaleThreshold; + + /// + /// Red-badge predicate — true when the host has accumulated enough consecutive + /// failures that an operator should take notice before the breaker trips. + /// + public static bool IsFlagged(HostStatusRow row) => + row.ConsecutiveFailures >= FailureFlagThreshold; } diff --git a/src/ZB.MOM.WW.OtOpcUa.Core/Resilience/DriverResiliencePipelineBuilder.cs b/src/ZB.MOM.WW.OtOpcUa.Core/Resilience/DriverResiliencePipelineBuilder.cs index d7e25af..c1b095e 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Core/Resilience/DriverResiliencePipelineBuilder.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Core/Resilience/DriverResiliencePipelineBuilder.cs @@ -24,11 +24,21 @@ public sealed class DriverResiliencePipelineBuilder { private readonly ConcurrentDictionary _pipelines = new(); private readonly TimeProvider _timeProvider; + private readonly DriverResilienceStatusTracker? _statusTracker; /// Construct with the ambient clock (use in prod). - public DriverResiliencePipelineBuilder(TimeProvider? timeProvider = null) + /// Clock source for pipeline timeouts + breaker sampling. Defaults to system. + /// When non-null, every built pipeline wires Polly telemetry into + /// the tracker — retries increment ConsecutiveFailures, breaker-open stamps + /// LastBreakerOpenUtc, breaker-close resets failures. Feeds Admin /hosts + + /// the Polly bulkhead-depth column. Absent tracker means no telemetry (unit tests + + /// deployments that don't care about resilience observability). + public DriverResiliencePipelineBuilder( + TimeProvider? timeProvider = null, + DriverResilienceStatusTracker? statusTracker = null) { _timeProvider = timeProvider ?? TimeProvider.System; + _statusTracker = statusTracker; } /// @@ -54,8 +64,9 @@ public sealed class DriverResiliencePipelineBuilder ArgumentException.ThrowIfNullOrWhiteSpace(hostName); var key = new PipelineKey(driverInstanceId, hostName, capability); - return _pipelines.GetOrAdd(key, static (_, state) => Build(state.capability, state.options, state.timeProvider), - (capability, options, timeProvider: _timeProvider)); + return _pipelines.GetOrAdd(key, static (k, state) => Build( + k.DriverInstanceId, k.HostName, state.capability, state.options, state.timeProvider, state.tracker), + (capability, options, timeProvider: _timeProvider, tracker: _statusTracker)); } /// Drop cached pipelines for one driver instance (e.g. on ResilienceConfig change). Test + Admin-reload use. @@ -74,9 +85,12 @@ public sealed class DriverResiliencePipelineBuilder public int CachedPipelineCount => _pipelines.Count; private static ResiliencePipeline Build( + string driverInstanceId, + string hostName, DriverCapability capability, DriverResilienceOptions options, - TimeProvider timeProvider) + TimeProvider timeProvider, + DriverResilienceStatusTracker? tracker) { var policy = options.Resolve(capability); var builder = new ResiliencePipelineBuilder { TimeProvider = timeProvider }; @@ -88,7 +102,7 @@ public sealed class DriverResiliencePipelineBuilder if (policy.RetryCount > 0) { - builder.AddRetry(new RetryStrategyOptions + var retryOptions = new RetryStrategyOptions { MaxRetryAttempts = policy.RetryCount, BackoffType = DelayBackoffType.Exponential, @@ -96,19 +110,44 @@ public sealed class DriverResiliencePipelineBuilder Delay = TimeSpan.FromMilliseconds(100), MaxDelay = TimeSpan.FromSeconds(5), ShouldHandle = new PredicateBuilder().Handle(ex => ex is not OperationCanceledException), - }); + }; + if (tracker is not null) + { + retryOptions.OnRetry = args => + { + tracker.RecordFailure(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime); + return default; + }; + } + builder.AddRetry(retryOptions); } if (policy.BreakerFailureThreshold > 0) { - builder.AddCircuitBreaker(new CircuitBreakerStrategyOptions + var breakerOptions = new CircuitBreakerStrategyOptions { FailureRatio = 1.0, MinimumThroughput = policy.BreakerFailureThreshold, SamplingDuration = TimeSpan.FromSeconds(30), BreakDuration = TimeSpan.FromSeconds(15), ShouldHandle = new PredicateBuilder().Handle(ex => ex is not OperationCanceledException), - }); + }; + if (tracker is not null) + { + breakerOptions.OnOpened = args => + { + tracker.RecordBreakerOpen(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime); + return default; + }; + breakerOptions.OnClosed = args => + { + // Closing the breaker means the target recovered — reset the consecutive- + // failure counter so Admin UI stops flashing red for this host. + tracker.RecordSuccess(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime); + return default; + }; + } + builder.AddCircuitBreaker(breakerOptions); } return builder.Build(); diff --git a/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Resilience/DriverResiliencePipelineBuilderTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Resilience/DriverResiliencePipelineBuilderTests.cs index 1167c5b..1d31808 100644 --- a/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Resilience/DriverResiliencePipelineBuilderTests.cs +++ b/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Resilience/DriverResiliencePipelineBuilderTests.cs @@ -219,4 +219,67 @@ public sealed class DriverResiliencePipelineBuilderTests attempts.ShouldBeLessThanOrEqualTo(1); } + + [Fact] + public async Task Tracker_RecordsFailure_OnEveryRetry() + { + var tracker = new DriverResilienceStatusTracker(); + var builder = new DriverResiliencePipelineBuilder(statusTracker: tracker); + var pipeline = builder.GetOrCreate("drv-trk", "host-x", DriverCapability.Read, TierAOptions); + + await Should.ThrowAsync(async () => + await pipeline.ExecuteAsync(async _ => + { + await Task.Yield(); + throw new InvalidOperationException("always fails"); + })); + + var snap = tracker.TryGet("drv-trk", "host-x"); + snap.ShouldNotBeNull(); + var retryCount = TierAOptions.Resolve(DriverCapability.Read).RetryCount; + snap!.ConsecutiveFailures.ShouldBe(retryCount); + } + + [Fact] + public async Task Tracker_StampsBreakerOpen_WhenBreakerTrips() + { + var tracker = new DriverResilienceStatusTracker(); + var builder = new DriverResiliencePipelineBuilder(statusTracker: tracker); + var pipeline = builder.GetOrCreate("drv-trk", "host-b", DriverCapability.Write, TierAOptions); + + var threshold = TierAOptions.Resolve(DriverCapability.Write).BreakerFailureThreshold; + for (var i = 0; i < threshold; i++) + { + await Should.ThrowAsync(async () => + await pipeline.ExecuteAsync(async _ => + { + await Task.Yield(); + throw new InvalidOperationException("boom"); + })); + } + + var snap = tracker.TryGet("drv-trk", "host-b"); + snap.ShouldNotBeNull(); + snap!.LastBreakerOpenUtc.ShouldNotBeNull(); + } + + [Fact] + public async Task Tracker_IsolatesCounters_PerHost() + { + var tracker = new DriverResilienceStatusTracker(); + var builder = new DriverResiliencePipelineBuilder(statusTracker: tracker); + var dead = builder.GetOrCreate("drv-trk", "dead", DriverCapability.Read, TierAOptions); + var live = builder.GetOrCreate("drv-trk", "live", DriverCapability.Read, TierAOptions); + + await Should.ThrowAsync(async () => + await dead.ExecuteAsync(async _ => + { + await Task.Yield(); + throw new InvalidOperationException("dead"); + })); + await live.ExecuteAsync(async _ => await Task.Yield()); + + tracker.TryGet("drv-trk", "dead")!.ConsecutiveFailures.ShouldBeGreaterThan(0); + tracker.TryGet("drv-trk", "live").ShouldBeNull(); + } }