Closes the observer half of #162 that was flagged as "persisted as 0 today" in PR #105. The Admin /hosts column refresh + FleetStatusHub SignalR push + red-badge visual still belong to the visual-compliance pass.

Core.Resilience:
- DriverResilienceStatusTracker gains RecordCallStart + RecordCallComplete + a CurrentInFlight field on the snapshot record. Concurrent-safe via the same ConcurrentDictionary.AddOrUpdate pattern as the other recorder methods. Clamps to zero on over-decrement so a stray Complete-without-Start can't drive the counter negative.
- CapabilityInvoker gains an optional statusTracker ctor parameter. When wired, every ExecuteAsync / ExecuteAsync(void) wraps the pipeline call in try / finally that records start/complete — so the counter advances cleanly whether the call succeeds, cancels, or throws. Null tracker keeps the pre-Phase-6.1 Stream E.3 behaviour exactly.

Server.Hosting:
- ResilienceStatusPublisherHostedService persists CurrentInFlight as the DriverInstanceResilienceStatus.CurrentBulkheadDepth column (was 0 before this PR). One-line fix on both the insert + update branches.

The in-flight counter is a pragmatic proxy for Polly's internal bulkhead depth — a future PR wiring Polly telemetry would replace it with the real value. The shape of the column + the publisher + the Admin /hosts query don't change, so the follow-up is invisible to consumers.

Tests (8 new InFlightCounterTests, all pass):
- Start+Complete nets to zero.
- Nested starts sum; Complete decrements.
- Complete-without-Start clamps to zero.
- Different hosts track independently.
- Concurrent starts (500 parallel) don't lose count.
- CapabilityInvoker observed-mid-call depth == 1 during a pending call.
- CapabilityInvoker exception path still decrements (try/finally).
- CapabilityInvoker without tracker doesn't throw.

Full solution dotnet test: 1243 passing (was 1235, +8). Pre-existing Client.CLI Subscribe flake unchanged.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
140 lines
6.2 KiB
C#
140 lines
6.2 KiB
C#
using Microsoft.EntityFrameworkCore;
|
|
using Microsoft.Extensions.Hosting;
|
|
using Microsoft.Extensions.Logging;
|
|
using ZB.MOM.WW.OtOpcUa.Configuration;
|
|
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
|
|
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
|
|
|
namespace ZB.MOM.WW.OtOpcUa.Server.Hosting;
|
|
|
|
/// <summary>
|
|
/// Samples <see cref="DriverResilienceStatusTracker"/> at a fixed tick + upserts each
|
|
/// <c>(DriverInstanceId, HostName)</c> snapshot into <see cref="DriverInstanceResilienceStatus"/>
|
|
/// so Admin <c>/hosts</c> can render live resilience counters across restarts.
|
|
/// </summary>
|
|
/// <remarks>
|
|
/// <para>Closes the HostedService piece of Phase 6.1 Stream E.2 flagged as a follow-up
|
|
/// when the tracker shipped in PR #82. The Admin UI column-refresh piece (red badge when
|
|
/// ConsecutiveFailures > breakerThreshold / 2 + SignalR push) is still deferred to
|
|
/// the visual-compliance pass — this service owns the persistence half alone.</para>
|
|
///
|
|
/// <para>Tick interval defaults to 5 s. Persistence is best-effort: a DB outage during
|
|
/// a tick logs + continues; the next tick tries again with the latest snapshots. The
|
|
/// hosted service never crashes the app on sample failure.</para>
|
|
///
|
|
/// <para><see cref="PersistOnceAsync"/> factored as a public method so tests can drive
|
|
/// it directly, matching the <see cref="ScheduledRecycleHostedService.TickOnceAsync"/>
|
|
/// pattern for deterministic unit-test timing.</para>
|
|
/// </remarks>
|
|
public sealed class ResilienceStatusPublisherHostedService : BackgroundService
{
    private readonly DriverResilienceStatusTracker _tracker;
    private readonly IDbContextFactory<OtOpcUaConfigDbContext> _dbContextFactory;
    private readonly ILogger<ResilienceStatusPublisherHostedService> _logger;
    private readonly TimeProvider _timeProvider;

    /// <summary>Tick interval — how often the tracker snapshot is persisted.</summary>
    public TimeSpan TickInterval { get; }

    /// <summary>
    /// Number of times <see cref="PersistOnceAsync"/> has been entered, for diagnostics + test
    /// assertions. Incremented before the snapshot is taken, so ticks that find an empty
    /// snapshot or fail to persist are counted too.
    /// </summary>
    public int TickCount { get; private set; }

    /// <summary>Creates the publisher.</summary>
    /// <param name="tracker">Source of the in-memory resilience snapshots.</param>
    /// <param name="dbContextFactory">Factory used to open a fresh context for every tick.</param>
    /// <param name="logger">Receives start/stop messages and persistence-failure warnings.</param>
    /// <param name="timeProvider">Clock used for the tick delay and the <c>LastSampledUtc</c>
    /// stamp; falls back to <see cref="TimeProvider.System"/> when <c>null</c>.</param>
    /// <param name="tickInterval">Delay between ticks; falls back to 5 seconds when <c>null</c>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="tracker"/> or
    /// <paramref name="dbContextFactory"/> is <c>null</c>.</exception>
    public ResilienceStatusPublisherHostedService(
        DriverResilienceStatusTracker tracker,
        IDbContextFactory<OtOpcUaConfigDbContext> dbContextFactory,
        ILogger<ResilienceStatusPublisherHostedService> logger,
        TimeProvider? timeProvider = null,
        TimeSpan? tickInterval = null)
    {
        ArgumentNullException.ThrowIfNull(tracker);
        ArgumentNullException.ThrowIfNull(dbContextFactory);

        _tracker = tracker;
        _dbContextFactory = dbContextFactory;
        _logger = logger;
        _timeProvider = timeProvider ?? TimeProvider.System;
        TickInterval = tickInterval ?? TimeSpan.FromSeconds(5);
    }

    /// <summary>
    /// Background loop: waits one <see cref="TickInterval"/>, persists one snapshot, repeats
    /// until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    /// <param name="stoppingToken">Signalled by the host when the service should stop.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation(
            "ResilienceStatusPublisherHostedService starting — tick interval = {Interval}",
            TickInterval);

        try
        {
            while (!stoppingToken.IsCancellationRequested)
            {
                await Task.Delay(TickInterval, _timeProvider, stoppingToken).ConfigureAwait(false);
                await PersistOnceAsync(stoppingToken).ConfigureAwait(false);
            }
        }
        catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
        {
            // Shutdown was requested while waiting for the next tick or while a tick was
            // persisting. Either way this is a normal stop, so fall through to the stop log
            // instead of surfacing the cancellation as a fault.
        }

        _logger.LogInformation("ResilienceStatusPublisherHostedService stopping after {TickCount} tick(s).", TickCount);
    }

    /// <summary>
    /// Take one snapshot of the tracker + upsert each pair into the persistence table.
    /// Swallows transient exceptions + logs them; never throws from a sample failure.
    /// </summary>
    /// <param name="cancellationToken">Cancels the database work for this tick.</param>
    /// <exception cref="OperationCanceledException">Only when <paramref name="cancellationToken"/>
    /// itself has been cancelled. A cancellation-shaped exception raised for any other reason is
    /// treated like every other persistence failure: logged and retried on the next tick.</exception>
    public async Task PersistOnceAsync(CancellationToken cancellationToken)
    {
        TickCount++;

        var snapshot = _tracker.Snapshot();
        if (snapshot.Count == 0)
        {
            // Nothing tracked yet; skip opening a context.
            return;
        }

        try
        {
            await using var db = await _dbContextFactory.CreateDbContextAsync(cancellationToken).ConfigureAwait(false);

            // One timestamp per tick so every row written by this pass shares the same LastSampledUtc.
            var now = _timeProvider.GetUtcNow().UtcDateTime;

            foreach (var (driverInstanceId, hostName, counters) in snapshot)
            {
                var existing = await db.DriverInstanceResilienceStatuses
                    .FirstOrDefaultAsync(x => x.DriverInstanceId == driverInstanceId && x.HostName == hostName, cancellationToken)
                    .ConfigureAwait(false);

                if (existing is null)
                {
                    // First sighting of this (driver instance, host) pair: insert a new row.
                    db.DriverInstanceResilienceStatuses.Add(new DriverInstanceResilienceStatus
                    {
                        DriverInstanceId = driverInstanceId,
                        HostName = hostName,
                        LastCircuitBreakerOpenUtc = counters.LastBreakerOpenUtc,
                        ConsecutiveFailures = counters.ConsecutiveFailures,
                        // The tracker's in-flight call count is what gets stored as the bulkhead depth.
                        CurrentBulkheadDepth = counters.CurrentInFlight,
                        LastRecycleUtc = counters.LastRecycleUtc,
                        BaselineFootprintBytes = counters.BaselineFootprintBytes,
                        CurrentFootprintBytes = counters.CurrentFootprintBytes,
                        LastSampledUtc = now,
                    });
                }
                else
                {
                    // Known pair: overwrite the tracked row with the latest counters.
                    existing.LastCircuitBreakerOpenUtc = counters.LastBreakerOpenUtc;
                    existing.ConsecutiveFailures = counters.ConsecutiveFailures;
                    existing.CurrentBulkheadDepth = counters.CurrentInFlight;
                    existing.LastRecycleUtc = counters.LastRecycleUtc;
                    existing.BaselineFootprintBytes = counters.BaselineFootprintBytes;
                    existing.CurrentFootprintBytes = counters.CurrentFootprintBytes;
                    existing.LastSampledUtc = now;
                }
            }

            // Single save for the whole snapshot.
            await db.SaveChangesAsync(cancellationToken).ConfigureAwait(false);
        }
        catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
        {
            // Best-effort persistence: everything except caller-requested cancellation is logged
            // and dropped. The filter deliberately lets an OperationCanceledException through
            // only when our own token was cancelled, so a cancellation-shaped failure from the
            // data layer cannot escape and fault the background service.
            _logger.LogWarning(ex,
                "ResilienceStatusPublisher persistence tick failed; next tick will retry with latest snapshots.");
        }
    }
}
|