Phase 6.1 Stream E.2 partial — ResilienceStatusPublisherHostedService persists tracker snapshots to DB
Closes the HostedService half of Phase 6.1 Stream E.2 flagged as a follow-up when the DriverResilienceStatusTracker shipped in PR #82. The Admin /hosts column refresh + SignalR push + red-badge visual (Stream E.3) remain deferred to the visual-compliance pass — this PR owns the persistence story alone. Server.Hosting: - ResilienceStatusPublisherHostedService : BackgroundService. Samples the DriverResilienceStatusTracker every TickInterval (default 5 s) and upserts each (DriverInstanceId, HostName) counter pair into DriverInstanceResilienceStatus via EF. New rows on first sight; in-place updates on subsequent ticks. - PersistOnceAsync extracted public so tests drive one tick directly — matches the ScheduledRecycleHostedService pattern for deterministic timing. - Best-effort persistence: a DB outage logs a warning + continues; the next tick retries. Never crashes the app on sample failure. Cancellation propagates through cleanly. - Tracks the bulkhead depth / recycle / footprint columns the entity was designed for. CurrentBulkheadDepth currently persisted as 0 — the tracker doesn't yet expose live bulkhead depth; a narrower follow-up wires the Polly bulkhead-depth observer into the tracker. Tests (6 new in ResilienceStatusPublisherHostedServiceTests): - Empty tracker → tick is a no-op, zero rows written. - Single-host counters → upsert a new row with ConsecutiveFailures + breaker timestamp + sampled timestamp. - Second tick updates the existing row in place (not a second insert). - Multi-host pairs persist independently. - Footprint counters (Baseline + Current) round-trip. - TickCount advances on every PersistOnceAsync call. Full solution dotnet test: 1225 passing (was 1219, +6). Pre-existing Client.CLI Subscribe flake unchanged. Production wiring (Program.cs) example: builder.Services.AddSingleton<DriverResilienceStatusTracker>(); builder.Services.AddHostedService<ResilienceStatusPublisherHostedService>(); // Tracker gets wired into CapabilityInvoker via OtOpcUaServer resolution // + the existing Phase 6.1 layer. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,138 @@
|
|||||||
|
using Microsoft.EntityFrameworkCore;
|
||||||
|
using Microsoft.Extensions.Hosting;
|
||||||
|
using Microsoft.Extensions.Logging;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Configuration;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||||
|
|
||||||
|
namespace ZB.MOM.WW.OtOpcUa.Server.Hosting;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Samples <see cref="DriverResilienceStatusTracker"/> at a fixed tick + upserts each
|
||||||
|
/// <c>(DriverInstanceId, HostName)</c> snapshot into <see cref="DriverInstanceResilienceStatus"/>
|
||||||
|
/// so Admin <c>/hosts</c> can render live resilience counters across restarts.
|
||||||
|
/// </summary>
|
||||||
|
/// <remarks>
|
||||||
|
/// <para>Closes the HostedService piece of Phase 6.1 Stream E.2 flagged as a follow-up
|
||||||
|
/// when the tracker shipped in PR #82. The Admin UI column-refresh piece (red badge when
|
||||||
|
/// ConsecutiveFailures > breakerThreshold / 2 + SignalR push) is still deferred to
|
||||||
|
/// the visual-compliance pass — this service owns the persistence half alone.</para>
|
||||||
|
///
|
||||||
|
/// <para>Tick interval defaults to 5 s. Persistence is best-effort: a DB outage during
|
||||||
|
/// a tick logs + continues; the next tick tries again with the latest snapshots. The
|
||||||
|
/// hosted service never crashes the app on sample failure.</para>
|
||||||
|
///
|
||||||
|
/// <para><see cref="PersistOnceAsync"/> factored as a public method so tests can drive
|
||||||
|
/// it directly, matching the <see cref="ScheduledRecycleHostedService.TickOnceAsync"/>
|
||||||
|
/// pattern for deterministic unit-test timing.</para>
|
||||||
|
/// </remarks>
|
||||||
|
public sealed class ResilienceStatusPublisherHostedService : BackgroundService
|
||||||
|
{
|
||||||
|
private readonly DriverResilienceStatusTracker _tracker;
|
||||||
|
private readonly IDbContextFactory<OtOpcUaConfigDbContext> _dbContextFactory;
|
||||||
|
private readonly ILogger<ResilienceStatusPublisherHostedService> _logger;
|
||||||
|
private readonly TimeProvider _timeProvider;
|
||||||
|
|
||||||
|
/// <summary>Tick interval — how often the tracker snapshot is persisted.</summary>
|
||||||
|
public TimeSpan TickInterval { get; }
|
||||||
|
|
||||||
|
/// <summary>Snapshot of the tick count for diagnostics + test assertions.</summary>
|
||||||
|
public int TickCount { get; private set; }
|
||||||
|
|
||||||
|
public ResilienceStatusPublisherHostedService(
|
||||||
|
DriverResilienceStatusTracker tracker,
|
||||||
|
IDbContextFactory<OtOpcUaConfigDbContext> dbContextFactory,
|
||||||
|
ILogger<ResilienceStatusPublisherHostedService> logger,
|
||||||
|
TimeProvider? timeProvider = null,
|
||||||
|
TimeSpan? tickInterval = null)
|
||||||
|
{
|
||||||
|
ArgumentNullException.ThrowIfNull(tracker);
|
||||||
|
ArgumentNullException.ThrowIfNull(dbContextFactory);
|
||||||
|
|
||||||
|
_tracker = tracker;
|
||||||
|
_dbContextFactory = dbContextFactory;
|
||||||
|
_logger = logger;
|
||||||
|
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||||
|
TickInterval = tickInterval ?? TimeSpan.FromSeconds(5);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||||
|
{
|
||||||
|
_logger.LogInformation(
|
||||||
|
"ResilienceStatusPublisherHostedService starting — tick interval = {Interval}",
|
||||||
|
TickInterval);
|
||||||
|
|
||||||
|
while (!stoppingToken.IsCancellationRequested)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
await Task.Delay(TickInterval, _timeProvider, stoppingToken).ConfigureAwait(false);
|
||||||
|
}
|
||||||
|
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
await PersistOnceAsync(stoppingToken).ConfigureAwait(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
_logger.LogInformation("ResilienceStatusPublisherHostedService stopping after {TickCount} tick(s).", TickCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Take one snapshot of the tracker + upsert each pair into the persistence table.
|
||||||
|
/// Swallows transient exceptions + logs them; never throws from a sample failure.
|
||||||
|
/// </summary>
|
||||||
|
public async Task PersistOnceAsync(CancellationToken cancellationToken)
|
||||||
|
{
|
||||||
|
TickCount++;
|
||||||
|
var snapshot = _tracker.Snapshot();
|
||||||
|
if (snapshot.Count == 0) return;
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
await using var db = await _dbContextFactory.CreateDbContextAsync(cancellationToken).ConfigureAwait(false);
|
||||||
|
var now = _timeProvider.GetUtcNow().UtcDateTime;
|
||||||
|
|
||||||
|
foreach (var (driverInstanceId, hostName, counters) in snapshot)
|
||||||
|
{
|
||||||
|
var existing = await db.DriverInstanceResilienceStatuses
|
||||||
|
.FirstOrDefaultAsync(x => x.DriverInstanceId == driverInstanceId && x.HostName == hostName, cancellationToken)
|
||||||
|
.ConfigureAwait(false);
|
||||||
|
|
||||||
|
if (existing is null)
|
||||||
|
{
|
||||||
|
db.DriverInstanceResilienceStatuses.Add(new DriverInstanceResilienceStatus
|
||||||
|
{
|
||||||
|
DriverInstanceId = driverInstanceId,
|
||||||
|
HostName = hostName,
|
||||||
|
LastCircuitBreakerOpenUtc = counters.LastBreakerOpenUtc,
|
||||||
|
ConsecutiveFailures = counters.ConsecutiveFailures,
|
||||||
|
CurrentBulkheadDepth = 0, // Phase 6.1 Stream A tracker doesn't emit bulkhead depth yet
|
||||||
|
LastRecycleUtc = counters.LastRecycleUtc,
|
||||||
|
BaselineFootprintBytes = counters.BaselineFootprintBytes,
|
||||||
|
CurrentFootprintBytes = counters.CurrentFootprintBytes,
|
||||||
|
LastSampledUtc = now,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
existing.LastCircuitBreakerOpenUtc = counters.LastBreakerOpenUtc;
|
||||||
|
existing.ConsecutiveFailures = counters.ConsecutiveFailures;
|
||||||
|
existing.LastRecycleUtc = counters.LastRecycleUtc;
|
||||||
|
existing.BaselineFootprintBytes = counters.BaselineFootprintBytes;
|
||||||
|
existing.CurrentFootprintBytes = counters.CurrentFootprintBytes;
|
||||||
|
existing.LastSampledUtc = now;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
await db.SaveChangesAsync(cancellationToken).ConfigureAwait(false);
|
||||||
|
}
|
||||||
|
catch (OperationCanceledException) { throw; }
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
_logger.LogWarning(ex,
|
||||||
|
"ResilienceStatusPublisher persistence tick failed; next tick will retry with latest snapshots.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,161 @@
|
|||||||
|
using Microsoft.EntityFrameworkCore;
|
||||||
|
using Microsoft.Extensions.Logging.Abstractions;
|
||||||
|
using Shouldly;
|
||||||
|
using Xunit;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Configuration;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Server.Hosting;
|
||||||
|
|
||||||
|
namespace ZB.MOM.WW.OtOpcUa.Server.Tests;
|
||||||
|
|
||||||
|
[Trait("Category", "Unit")]
|
||||||
|
public sealed class ResilienceStatusPublisherHostedServiceTests : IDisposable
|
||||||
|
{
|
||||||
|
private static readonly DateTime T0 = new(2026, 4, 19, 12, 0, 0, DateTimeKind.Utc);
|
||||||
|
|
||||||
|
private sealed class FakeClock : TimeProvider
|
||||||
|
{
|
||||||
|
public DateTime Utc { get; set; } = T0;
|
||||||
|
public override DateTimeOffset GetUtcNow() => new(Utc, TimeSpan.Zero);
|
||||||
|
}
|
||||||
|
|
||||||
|
private sealed class InMemoryDbContextFactory : IDbContextFactory<OtOpcUaConfigDbContext>
|
||||||
|
{
|
||||||
|
private readonly DbContextOptions<OtOpcUaConfigDbContext> _options;
|
||||||
|
public InMemoryDbContextFactory(string dbName)
|
||||||
|
{
|
||||||
|
_options = new DbContextOptionsBuilder<OtOpcUaConfigDbContext>()
|
||||||
|
.UseInMemoryDatabase(dbName)
|
||||||
|
.Options;
|
||||||
|
}
|
||||||
|
public OtOpcUaConfigDbContext CreateDbContext() => new(_options);
|
||||||
|
}
|
||||||
|
|
||||||
|
private readonly string _dbName = $"resilience-pub-{Guid.NewGuid():N}";
|
||||||
|
private readonly InMemoryDbContextFactory _factory;
|
||||||
|
private readonly OtOpcUaConfigDbContext _readCtx;
|
||||||
|
|
||||||
|
public ResilienceStatusPublisherHostedServiceTests()
|
||||||
|
{
|
||||||
|
_factory = new InMemoryDbContextFactory(_dbName);
|
||||||
|
_readCtx = _factory.CreateDbContext();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void Dispose() => _readCtx.Dispose();
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task EmptyTracker_Tick_NoOp_NoRowsWritten()
|
||||||
|
{
|
||||||
|
var tracker = new DriverResilienceStatusTracker();
|
||||||
|
var host = new ResilienceStatusPublisherHostedService(
|
||||||
|
tracker, _factory, NullLogger<ResilienceStatusPublisherHostedService>.Instance);
|
||||||
|
|
||||||
|
await host.PersistOnceAsync(CancellationToken.None);
|
||||||
|
|
||||||
|
host.TickCount.ShouldBe(1);
|
||||||
|
(await _readCtx.DriverInstanceResilienceStatuses.CountAsync()).ShouldBe(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task SingleHost_OnePairWithCounters_UpsertsNewRow()
|
||||||
|
{
|
||||||
|
var clock = new FakeClock();
|
||||||
|
var tracker = new DriverResilienceStatusTracker();
|
||||||
|
tracker.RecordFailure("drv-1", "plc-a", T0);
|
||||||
|
tracker.RecordFailure("drv-1", "plc-a", T0);
|
||||||
|
tracker.RecordBreakerOpen("drv-1", "plc-a", T0.AddSeconds(1));
|
||||||
|
|
||||||
|
var host = new ResilienceStatusPublisherHostedService(
|
||||||
|
tracker, _factory, NullLogger<ResilienceStatusPublisherHostedService>.Instance,
|
||||||
|
timeProvider: clock);
|
||||||
|
|
||||||
|
clock.Utc = T0.AddSeconds(2);
|
||||||
|
await host.PersistOnceAsync(CancellationToken.None);
|
||||||
|
|
||||||
|
var row = await _readCtx.DriverInstanceResilienceStatuses.SingleAsync();
|
||||||
|
row.DriverInstanceId.ShouldBe("drv-1");
|
||||||
|
row.HostName.ShouldBe("plc-a");
|
||||||
|
row.ConsecutiveFailures.ShouldBe(2);
|
||||||
|
row.LastCircuitBreakerOpenUtc.ShouldBe(T0.AddSeconds(1));
|
||||||
|
row.LastSampledUtc.ShouldBe(T0.AddSeconds(2));
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task SecondTick_UpdatesExistingRow_InPlace()
|
||||||
|
{
|
||||||
|
var clock = new FakeClock();
|
||||||
|
var tracker = new DriverResilienceStatusTracker();
|
||||||
|
tracker.RecordFailure("drv-1", "plc-a", T0);
|
||||||
|
|
||||||
|
var host = new ResilienceStatusPublisherHostedService(
|
||||||
|
tracker, _factory, NullLogger<ResilienceStatusPublisherHostedService>.Instance,
|
||||||
|
timeProvider: clock);
|
||||||
|
|
||||||
|
clock.Utc = T0.AddSeconds(5);
|
||||||
|
await host.PersistOnceAsync(CancellationToken.None);
|
||||||
|
|
||||||
|
// Second tick: success resets the counter.
|
||||||
|
tracker.RecordSuccess("drv-1", "plc-a", T0.AddSeconds(6));
|
||||||
|
clock.Utc = T0.AddSeconds(10);
|
||||||
|
await host.PersistOnceAsync(CancellationToken.None);
|
||||||
|
|
||||||
|
(await _readCtx.DriverInstanceResilienceStatuses.CountAsync()).ShouldBe(1, "one row, updated in place");
|
||||||
|
var row = await _readCtx.DriverInstanceResilienceStatuses.SingleAsync();
|
||||||
|
row.ConsecutiveFailures.ShouldBe(0);
|
||||||
|
row.LastSampledUtc.ShouldBe(T0.AddSeconds(10));
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task MultipleHosts_BothPersist_Independently()
|
||||||
|
{
|
||||||
|
var tracker = new DriverResilienceStatusTracker();
|
||||||
|
tracker.RecordFailure("drv-1", "plc-a", T0);
|
||||||
|
tracker.RecordFailure("drv-1", "plc-a", T0);
|
||||||
|
tracker.RecordFailure("drv-1", "plc-b", T0);
|
||||||
|
|
||||||
|
var host = new ResilienceStatusPublisherHostedService(
|
||||||
|
tracker, _factory, NullLogger<ResilienceStatusPublisherHostedService>.Instance);
|
||||||
|
|
||||||
|
await host.PersistOnceAsync(CancellationToken.None);
|
||||||
|
|
||||||
|
var rows = await _readCtx.DriverInstanceResilienceStatuses
|
||||||
|
.OrderBy(r => r.HostName)
|
||||||
|
.ToListAsync();
|
||||||
|
rows.Count.ShouldBe(2);
|
||||||
|
rows[0].HostName.ShouldBe("plc-a");
|
||||||
|
rows[0].ConsecutiveFailures.ShouldBe(2);
|
||||||
|
rows[1].HostName.ShouldBe("plc-b");
|
||||||
|
rows[1].ConsecutiveFailures.ShouldBe(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task FootprintCounters_Persist()
|
||||||
|
{
|
||||||
|
var tracker = new DriverResilienceStatusTracker();
|
||||||
|
tracker.RecordFootprint("drv-1", "plc-a",
|
||||||
|
baselineBytes: 100_000_000, currentBytes: 150_000_000, T0);
|
||||||
|
|
||||||
|
var host = new ResilienceStatusPublisherHostedService(
|
||||||
|
tracker, _factory, NullLogger<ResilienceStatusPublisherHostedService>.Instance);
|
||||||
|
|
||||||
|
await host.PersistOnceAsync(CancellationToken.None);
|
||||||
|
|
||||||
|
var row = await _readCtx.DriverInstanceResilienceStatuses.SingleAsync();
|
||||||
|
row.BaselineFootprintBytes.ShouldBe(100_000_000);
|
||||||
|
row.CurrentFootprintBytes.ShouldBe(150_000_000);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task TickCount_Advances_OnEveryCall()
|
||||||
|
{
|
||||||
|
var tracker = new DriverResilienceStatusTracker();
|
||||||
|
var host = new ResilienceStatusPublisherHostedService(
|
||||||
|
tracker, _factory, NullLogger<ResilienceStatusPublisherHostedService>.Instance);
|
||||||
|
|
||||||
|
await host.PersistOnceAsync(CancellationToken.None);
|
||||||
|
await host.PersistOnceAsync(CancellationToken.None);
|
||||||
|
await host.PersistOnceAsync(CancellationToken.None);
|
||||||
|
|
||||||
|
host.TickCount.ShouldBe(3);
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user