Closes the observer half of #162 that was flagged as "persisted as 0 today" in PR #105. The Admin /hosts column refresh + FleetStatusHub SignalR push + red-badge visual still belong to the visual-compliance pass.

Core.Resilience:
- DriverResilienceStatusTracker gains RecordCallStart + RecordCallComplete + CurrentInFlight field on the snapshot record. Concurrent-safe via the same ConcurrentDictionary.AddOrUpdate pattern as the other recorder methods. Clamps to zero on over-decrement so a stray Complete-without-Start can't drive the counter negative.
- CapabilityInvoker gains an optional statusTracker ctor parameter. When wired, every ExecuteAsync / ExecuteAsync(void) wraps the pipeline call in try / finally that records start/complete — so the counter advances cleanly whether the call succeeds, cancels, or throws. Null tracker keeps the pre-Phase-6.1 Stream E.3 behaviour exactly.

Server.Hosting:
- ResilienceStatusPublisherHostedService persists CurrentInFlight as the DriverInstanceResilienceStatus.CurrentBulkheadDepth column (was 0 before this PR). One-line fix on both the insert + update branches.

The in-flight counter is a pragmatic proxy for Polly's internal bulkhead depth — a future PR wiring Polly telemetry would replace it with the real value. The shape of the column + the publisher + the Admin /hosts query doesn't change, so the follow-up is invisible to consumers.

Tests (8 new InFlightCounterTests, all pass):
- Start+Complete nets to zero.
- Nested starts sum; Complete decrements.
- Complete-without-Start clamps to zero.
- Different hosts track independently.
- Concurrent starts (500 parallel) don't lose count.
- CapabilityInvoker observed-mid-call depth == 1 during a pending call.
- CapabilityInvoker exception path still decrements (try/finally).
- CapabilityInvoker without tracker doesn't throw.

Full solution dotnet test: 1243 passing (was 1235, +8). Pre-existing Client.CLI Subscribe flake unchanged.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
136 lines
6.5 KiB
C#
136 lines
6.5 KiB
C#
using System.Collections.Concurrent;
|
|
|
|
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
|
|
|
/// <summary>
/// Process-singleton tracker of live resilience counters per
/// <c>(DriverInstanceId, HostName)</c>. Populated by the CapabilityInvoker and the
/// MemoryTracking layer; consumed by a HostedService that periodically persists a
/// snapshot to the <c>DriverInstanceResilienceStatus</c> table for Admin <c>/hosts</c>.
/// </summary>
/// <remarks>
/// <para>
/// Per Phase 6.1 Stream E. No DB dependency here — the tracker is pure in-memory so
/// tests can exercise it without EF Core or SQL Server. The HostedService that writes
/// snapshots lives in the Server project (Stream E.2); the actual SignalR push + Blazor
/// page refresh (E.3) lands in a follow-up visual-review PR.
/// </para>
/// <para>
/// Every recorder replaces the stored snapshot through
/// <see cref="ConcurrentDictionary{TKey,TValue}.AddOrUpdate(TKey, Func{TKey,TValue}, Func{TKey,TValue,TValue})"/>.
/// The add/update delegates may be invoked more than once when writers contend on the
/// same key, so they must stay free of side effects: they only build a new snapshot
/// from the existing one.
/// </para>
/// </remarks>
public sealed class DriverResilienceStatusTracker
{
    // Latest snapshot per (instance, host). Snapshots are never mutated in place;
    // each recorder swaps in a fresh record built with a `with` expression.
    private readonly ConcurrentDictionary<StatusKey, ResilienceStatusSnapshot> _status = new();

    /// <summary>Record a Polly pipeline failure for <paramref name="hostName"/>.</summary>
    /// <param name="driverInstanceId">Driver instance half of the tracking key.</param>
    /// <param name="hostName">Host half of the tracking key.</param>
    /// <param name="utcNow">Timestamp stored as <see cref="ResilienceStatusSnapshot.LastSampledUtc"/>.</param>
    /// <remarks>
    /// Increments <see cref="ResilienceStatusSnapshot.ConsecutiveFailures"/>; a key seen for
    /// the first time starts at 1.
    /// </remarks>
    public void RecordFailure(string driverInstanceId, string hostName, DateTime utcNow)
    {
        var key = new StatusKey(driverInstanceId, hostName);
        _status.AddOrUpdate(
            key,
            _ => new ResilienceStatusSnapshot { ConsecutiveFailures = 1, LastSampledUtc = utcNow },
            (_, existing) => existing with
            {
                ConsecutiveFailures = existing.ConsecutiveFailures + 1,
                LastSampledUtc = utcNow,
            });
    }

    /// <summary>Reset the consecutive-failure count on a successful pipeline execution.</summary>
    /// <param name="driverInstanceId">Driver instance half of the tracking key.</param>
    /// <param name="hostName">Host half of the tracking key.</param>
    /// <param name="utcNow">Timestamp stored as <see cref="ResilienceStatusSnapshot.LastSampledUtc"/>.</param>
    public void RecordSuccess(string driverInstanceId, string hostName, DateTime utcNow)
    {
        var key = new StatusKey(driverInstanceId, hostName);
        _status.AddOrUpdate(
            key,
            _ => new ResilienceStatusSnapshot { ConsecutiveFailures = 0, LastSampledUtc = utcNow },
            (_, existing) => existing with
            {
                ConsecutiveFailures = 0,
                LastSampledUtc = utcNow,
            });
    }

    /// <summary>Record a circuit-breaker open event.</summary>
    /// <param name="driverInstanceId">Driver instance half of the tracking key.</param>
    /// <param name="hostName">Host half of the tracking key.</param>
    /// <param name="utcNow">
    /// Timestamp stored as both <see cref="ResilienceStatusSnapshot.LastBreakerOpenUtc"/> and
    /// <see cref="ResilienceStatusSnapshot.LastSampledUtc"/>.
    /// </param>
    public void RecordBreakerOpen(string driverInstanceId, string hostName, DateTime utcNow)
    {
        var key = new StatusKey(driverInstanceId, hostName);
        _status.AddOrUpdate(
            key,
            _ => new ResilienceStatusSnapshot { LastBreakerOpenUtc = utcNow, LastSampledUtc = utcNow },
            (_, existing) => existing with { LastBreakerOpenUtc = utcNow, LastSampledUtc = utcNow });
    }

    /// <summary>Record a process recycle event (Tier C only).</summary>
    /// <param name="driverInstanceId">Driver instance half of the tracking key.</param>
    /// <param name="hostName">Host half of the tracking key.</param>
    /// <param name="utcNow">
    /// Timestamp stored as both <see cref="ResilienceStatusSnapshot.LastRecycleUtc"/> and
    /// <see cref="ResilienceStatusSnapshot.LastSampledUtc"/>.
    /// </param>
    public void RecordRecycle(string driverInstanceId, string hostName, DateTime utcNow)
    {
        var key = new StatusKey(driverInstanceId, hostName);
        _status.AddOrUpdate(
            key,
            _ => new ResilienceStatusSnapshot { LastRecycleUtc = utcNow, LastSampledUtc = utcNow },
            (_, existing) => existing with { LastRecycleUtc = utcNow, LastSampledUtc = utcNow });
    }

    /// <summary>Capture / update the MemoryTracking-supplied baseline + current footprint.</summary>
    /// <param name="driverInstanceId">Driver instance half of the tracking key.</param>
    /// <param name="hostName">Host half of the tracking key.</param>
    /// <param name="baselineBytes">Value stored as <see cref="ResilienceStatusSnapshot.BaselineFootprintBytes"/>.</param>
    /// <param name="currentBytes">Value stored as <see cref="ResilienceStatusSnapshot.CurrentFootprintBytes"/>.</param>
    /// <param name="utcNow">Timestamp stored as <see cref="ResilienceStatusSnapshot.LastSampledUtc"/>.</param>
    /// <remarks>Both footprint values are overwritten on every call; the other counters are preserved.</remarks>
    public void RecordFootprint(string driverInstanceId, string hostName, long baselineBytes, long currentBytes, DateTime utcNow)
    {
        var key = new StatusKey(driverInstanceId, hostName);
        _status.AddOrUpdate(
            key,
            _ => new ResilienceStatusSnapshot
            {
                BaselineFootprintBytes = baselineBytes,
                CurrentFootprintBytes = currentBytes,
                LastSampledUtc = utcNow,
            },
            (_, existing) => existing with
            {
                BaselineFootprintBytes = baselineBytes,
                CurrentFootprintBytes = currentBytes,
                LastSampledUtc = utcNow,
            });
    }

    /// <summary>
    /// Record the entry of a capability call for this (instance, host). Increments the
    /// in-flight counter used as the <see cref="ResilienceStatusSnapshot.CurrentInFlight"/>
    /// surface (a cheap stand-in for Polly bulkhead depth). Paired with
    /// <see cref="RecordCallComplete"/>; callers use try/finally.
    /// </summary>
    /// <param name="driverInstanceId">Driver instance half of the tracking key.</param>
    /// <param name="hostName">Host half of the tracking key.</param>
    /// <remarks>
    /// Takes no timestamp, so <see cref="ResilienceStatusSnapshot.LastSampledUtc"/> is left
    /// as-is; an entry first created here keeps the default <see cref="DateTime"/> until
    /// another recorder stamps it.
    /// </remarks>
    public void RecordCallStart(string driverInstanceId, string hostName)
    {
        var key = new StatusKey(driverInstanceId, hostName);
        _status.AddOrUpdate(
            key,
            _ => new ResilienceStatusSnapshot { CurrentInFlight = 1 },
            (_, existing) => existing with { CurrentInFlight = existing.CurrentInFlight + 1 });
    }

    /// <summary>Paired with <see cref="RecordCallStart"/> — decrements the in-flight counter.</summary>
    /// <param name="driverInstanceId">Driver instance half of the tracking key.</param>
    /// <param name="hostName">Host half of the tracking key.</param>
    /// <remarks>
    /// The counter never drops below zero. A Complete for a key that has no entry yet
    /// creates one with a zero counter. <see cref="ResilienceStatusSnapshot.LastSampledUtc"/>
    /// is not touched.
    /// </remarks>
    public void RecordCallComplete(string driverInstanceId, string hostName)
    {
        var key = new StatusKey(driverInstanceId, hostName);
        _status.AddOrUpdate(
            key,
            // Complete-without-Start shouldn't happen; seed the entry at 0 instead of going negative.
            _ => new ResilienceStatusSnapshot { CurrentInFlight = 0 },
            // Clamp so an over-decrement cannot drive the counter below zero.
            (_, existing) => existing with { CurrentInFlight = Math.Max(0, existing.CurrentInFlight - 1) });
    }

    /// <summary>Snapshot of a specific (instance, host) pair; null if no counters recorded yet.</summary>
    /// <param name="driverInstanceId">Driver instance half of the tracking key.</param>
    /// <param name="hostName">Host half of the tracking key.</param>
    /// <returns>The currently stored snapshot record, or <c>null</c> when the key is not tracked.</returns>
    public ResilienceStatusSnapshot? TryGet(string driverInstanceId, string hostName) =>
        _status.TryGetValue(new StatusKey(driverInstanceId, hostName), out var snapshot) ? snapshot : null;

    /// <summary>Copy of every currently-tracked (instance, host, snapshot) triple. Safe under concurrent writes.</summary>
    /// <returns>
    /// A newly materialized list. Entries are read by enumerating the concurrent dictionary,
    /// so writes that land during the enumeration may or may not be reflected.
    /// </returns>
    public IReadOnlyList<(string DriverInstanceId, string HostName, ResilienceStatusSnapshot Snapshot)> Snapshot() =>
        _status.Select(kvp => (kvp.Key.DriverInstanceId, kvp.Key.HostName, kvp.Value)).ToList();

    // Composite dictionary key; record-struct value equality makes two keys equal
    // when both components are equal.
    private readonly record struct StatusKey(string DriverInstanceId, string HostName);
}
|
|
|
|
/// <summary>Snapshot of the resilience counters for one <c>(DriverInstanceId, HostName)</c> pair.</summary>
/// <remarks>
/// All members are init-only, so an instance cannot change after construction; updates are
/// expressed by building a new record with a <c>with</c> expression. A freshly constructed
/// snapshot has zero counters, null event timestamps and a default <see cref="LastSampledUtc"/>.
/// </remarks>
public sealed record ResilienceStatusSnapshot
{
    /// <summary>
    /// Failures recorded since the last success: incremented by
    /// <c>DriverResilienceStatusTracker.RecordFailure</c>, reset to zero by <c>RecordSuccess</c>.
    /// </summary>
    public int ConsecutiveFailures { get; init; }

    /// <summary>Timestamp passed to the most recent <c>RecordBreakerOpen</c>; null if none was recorded.</summary>
    public DateTime? LastBreakerOpenUtc { get; init; }

    /// <summary>Timestamp passed to the most recent <c>RecordRecycle</c>; null if none was recorded.</summary>
    public DateTime? LastRecycleUtc { get; init; }

    /// <summary>Baseline footprint value most recently supplied to <c>RecordFootprint</c>.</summary>
    public long BaselineFootprintBytes { get; init; }

    /// <summary>Current footprint value most recently supplied to <c>RecordFootprint</c>.</summary>
    public long CurrentFootprintBytes { get; init; }

    /// <summary>
    /// Timestamp of the most recent failure / success / breaker-open / recycle / footprint
    /// record. The in-flight recorders (<c>RecordCallStart</c>, <c>RecordCallComplete</c>) do
    /// not update it.
    /// </summary>
    public DateTime LastSampledUtc { get; init; }

    /// <summary>
    /// In-flight capability calls against this (instance, host). Bumped on call entry +
    /// decremented on completion. Feeds <c>DriverInstanceResilienceStatus.CurrentBulkheadDepth</c>
    /// for Admin <c>/hosts</c> — a cheap proxy for the Polly bulkhead depth until the full
    /// telemetry observer lands.
    /// </summary>
    public int CurrentInFlight { get; init; }
}
|