Phase 6.1 Stream E.2 partial - ResilienceStatusPublisherHostedService #105

Merged
dohertj2 merged 1 commits from phase-6-1-resilience-status-publisher into v2 2026-04-19 14:37:54 -04:00
2 changed files with 299 additions and 0 deletions

View File

@@ -0,0 +1,138 @@
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.OtOpcUa.Configuration;
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
namespace ZB.MOM.WW.OtOpcUa.Server.Hosting;
/// <summary>
/// Samples <see cref="DriverResilienceStatusTracker"/> at a fixed tick + upserts each
/// <c>(DriverInstanceId, HostName)</c> snapshot into <see cref="DriverInstanceResilienceStatus"/>
/// so Admin <c>/hosts</c> can render live resilience counters across restarts.
/// </summary>
/// <remarks>
/// <para>Closes the HostedService piece of Phase 6.1 Stream E.2 flagged as a follow-up
/// when the tracker shipped in PR #82. The Admin UI column-refresh piece (red badge when
/// ConsecutiveFailures &gt; breakerThreshold / 2 + SignalR push) is still deferred to
/// the visual-compliance pass — this service owns the persistence half alone.</para>
///
/// <para>Tick interval defaults to 5 s. Persistence is best-effort: a DB outage during
/// a tick logs + continues; the next tick tries again with the latest snapshots. The
/// hosted service never crashes the app on sample failure.</para>
///
/// <para><see cref="PersistOnceAsync"/> factored as a public method so tests can drive
/// it directly, matching the <see cref="ScheduledRecycleHostedService.TickOnceAsync"/>
/// pattern for deterministic unit-test timing.</para>
/// </remarks>
public sealed class ResilienceStatusPublisherHostedService : BackgroundService
{
private readonly DriverResilienceStatusTracker _tracker;
private readonly IDbContextFactory<OtOpcUaConfigDbContext> _dbContextFactory;
private readonly ILogger<ResilienceStatusPublisherHostedService> _logger;
private readonly TimeProvider _timeProvider;
/// <summary>Tick interval — how often the tracker snapshot is persisted.</summary>
public TimeSpan TickInterval { get; }
/// <summary>Snapshot of the tick count for diagnostics + test assertions.</summary>
public int TickCount { get; private set; }
public ResilienceStatusPublisherHostedService(
DriverResilienceStatusTracker tracker,
IDbContextFactory<OtOpcUaConfigDbContext> dbContextFactory,
ILogger<ResilienceStatusPublisherHostedService> logger,
TimeProvider? timeProvider = null,
TimeSpan? tickInterval = null)
{
ArgumentNullException.ThrowIfNull(tracker);
ArgumentNullException.ThrowIfNull(dbContextFactory);
_tracker = tracker;
_dbContextFactory = dbContextFactory;
_logger = logger;
_timeProvider = timeProvider ?? TimeProvider.System;
TickInterval = tickInterval ?? TimeSpan.FromSeconds(5);
}
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
_logger.LogInformation(
"ResilienceStatusPublisherHostedService starting — tick interval = {Interval}",
TickInterval);
while (!stoppingToken.IsCancellationRequested)
{
try
{
await Task.Delay(TickInterval, _timeProvider, stoppingToken).ConfigureAwait(false);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
await PersistOnceAsync(stoppingToken).ConfigureAwait(false);
}
_logger.LogInformation("ResilienceStatusPublisherHostedService stopping after {TickCount} tick(s).", TickCount);
}
/// <summary>
/// Take one snapshot of the tracker + upsert each pair into the persistence table.
/// Swallows transient exceptions + logs them; never throws from a sample failure.
/// </summary>
public async Task PersistOnceAsync(CancellationToken cancellationToken)
{
TickCount++;
var snapshot = _tracker.Snapshot();
if (snapshot.Count == 0) return;
try
{
await using var db = await _dbContextFactory.CreateDbContextAsync(cancellationToken).ConfigureAwait(false);
var now = _timeProvider.GetUtcNow().UtcDateTime;
foreach (var (driverInstanceId, hostName, counters) in snapshot)
{
var existing = await db.DriverInstanceResilienceStatuses
.FirstOrDefaultAsync(x => x.DriverInstanceId == driverInstanceId && x.HostName == hostName, cancellationToken)
.ConfigureAwait(false);
if (existing is null)
{
db.DriverInstanceResilienceStatuses.Add(new DriverInstanceResilienceStatus
{
DriverInstanceId = driverInstanceId,
HostName = hostName,
LastCircuitBreakerOpenUtc = counters.LastBreakerOpenUtc,
ConsecutiveFailures = counters.ConsecutiveFailures,
CurrentBulkheadDepth = 0, // Phase 6.1 Stream A tracker doesn't emit bulkhead depth yet
LastRecycleUtc = counters.LastRecycleUtc,
BaselineFootprintBytes = counters.BaselineFootprintBytes,
CurrentFootprintBytes = counters.CurrentFootprintBytes,
LastSampledUtc = now,
});
}
else
{
existing.LastCircuitBreakerOpenUtc = counters.LastBreakerOpenUtc;
existing.ConsecutiveFailures = counters.ConsecutiveFailures;
existing.LastRecycleUtc = counters.LastRecycleUtc;
existing.BaselineFootprintBytes = counters.BaselineFootprintBytes;
existing.CurrentFootprintBytes = counters.CurrentFootprintBytes;
existing.LastSampledUtc = now;
}
}
await db.SaveChangesAsync(cancellationToken).ConfigureAwait(false);
}
catch (OperationCanceledException) { throw; }
catch (Exception ex)
{
_logger.LogWarning(ex,
"ResilienceStatusPublisher persistence tick failed; next tick will retry with latest snapshots.");
}
}
}

View File

@@ -0,0 +1,161 @@
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging.Abstractions;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Configuration;
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
using ZB.MOM.WW.OtOpcUa.Server.Hosting;
namespace ZB.MOM.WW.OtOpcUa.Server.Tests;
[Trait("Category", "Unit")]
public sealed class ResilienceStatusPublisherHostedServiceTests : IDisposable
{
private static readonly DateTime T0 = new(2026, 4, 19, 12, 0, 0, DateTimeKind.Utc);
private sealed class FakeClock : TimeProvider
{
public DateTime Utc { get; set; } = T0;
public override DateTimeOffset GetUtcNow() => new(Utc, TimeSpan.Zero);
}
private sealed class InMemoryDbContextFactory : IDbContextFactory<OtOpcUaConfigDbContext>
{
private readonly DbContextOptions<OtOpcUaConfigDbContext> _options;
public InMemoryDbContextFactory(string dbName)
{
_options = new DbContextOptionsBuilder<OtOpcUaConfigDbContext>()
.UseInMemoryDatabase(dbName)
.Options;
}
public OtOpcUaConfigDbContext CreateDbContext() => new(_options);
}
private readonly string _dbName = $"resilience-pub-{Guid.NewGuid():N}";
private readonly InMemoryDbContextFactory _factory;
private readonly OtOpcUaConfigDbContext _readCtx;
public ResilienceStatusPublisherHostedServiceTests()
{
_factory = new InMemoryDbContextFactory(_dbName);
_readCtx = _factory.CreateDbContext();
}
public void Dispose() => _readCtx.Dispose();
[Fact]
public async Task EmptyTracker_Tick_NoOp_NoRowsWritten()
{
var tracker = new DriverResilienceStatusTracker();
var host = new ResilienceStatusPublisherHostedService(
tracker, _factory, NullLogger<ResilienceStatusPublisherHostedService>.Instance);
await host.PersistOnceAsync(CancellationToken.None);
host.TickCount.ShouldBe(1);
(await _readCtx.DriverInstanceResilienceStatuses.CountAsync()).ShouldBe(0);
}
[Fact]
public async Task SingleHost_OnePairWithCounters_UpsertsNewRow()
{
var clock = new FakeClock();
var tracker = new DriverResilienceStatusTracker();
tracker.RecordFailure("drv-1", "plc-a", T0);
tracker.RecordFailure("drv-1", "plc-a", T0);
tracker.RecordBreakerOpen("drv-1", "plc-a", T0.AddSeconds(1));
var host = new ResilienceStatusPublisherHostedService(
tracker, _factory, NullLogger<ResilienceStatusPublisherHostedService>.Instance,
timeProvider: clock);
clock.Utc = T0.AddSeconds(2);
await host.PersistOnceAsync(CancellationToken.None);
var row = await _readCtx.DriverInstanceResilienceStatuses.SingleAsync();
row.DriverInstanceId.ShouldBe("drv-1");
row.HostName.ShouldBe("plc-a");
row.ConsecutiveFailures.ShouldBe(2);
row.LastCircuitBreakerOpenUtc.ShouldBe(T0.AddSeconds(1));
row.LastSampledUtc.ShouldBe(T0.AddSeconds(2));
}
[Fact]
public async Task SecondTick_UpdatesExistingRow_InPlace()
{
var clock = new FakeClock();
var tracker = new DriverResilienceStatusTracker();
tracker.RecordFailure("drv-1", "plc-a", T0);
var host = new ResilienceStatusPublisherHostedService(
tracker, _factory, NullLogger<ResilienceStatusPublisherHostedService>.Instance,
timeProvider: clock);
clock.Utc = T0.AddSeconds(5);
await host.PersistOnceAsync(CancellationToken.None);
// Second tick: success resets the counter.
tracker.RecordSuccess("drv-1", "plc-a", T0.AddSeconds(6));
clock.Utc = T0.AddSeconds(10);
await host.PersistOnceAsync(CancellationToken.None);
(await _readCtx.DriverInstanceResilienceStatuses.CountAsync()).ShouldBe(1, "one row, updated in place");
var row = await _readCtx.DriverInstanceResilienceStatuses.SingleAsync();
row.ConsecutiveFailures.ShouldBe(0);
row.LastSampledUtc.ShouldBe(T0.AddSeconds(10));
}
[Fact]
public async Task MultipleHosts_BothPersist_Independently()
{
var tracker = new DriverResilienceStatusTracker();
tracker.RecordFailure("drv-1", "plc-a", T0);
tracker.RecordFailure("drv-1", "plc-a", T0);
tracker.RecordFailure("drv-1", "plc-b", T0);
var host = new ResilienceStatusPublisherHostedService(
tracker, _factory, NullLogger<ResilienceStatusPublisherHostedService>.Instance);
await host.PersistOnceAsync(CancellationToken.None);
var rows = await _readCtx.DriverInstanceResilienceStatuses
.OrderBy(r => r.HostName)
.ToListAsync();
rows.Count.ShouldBe(2);
rows[0].HostName.ShouldBe("plc-a");
rows[0].ConsecutiveFailures.ShouldBe(2);
rows[1].HostName.ShouldBe("plc-b");
rows[1].ConsecutiveFailures.ShouldBe(1);
}
[Fact]
public async Task FootprintCounters_Persist()
{
var tracker = new DriverResilienceStatusTracker();
tracker.RecordFootprint("drv-1", "plc-a",
baselineBytes: 100_000_000, currentBytes: 150_000_000, T0);
var host = new ResilienceStatusPublisherHostedService(
tracker, _factory, NullLogger<ResilienceStatusPublisherHostedService>.Instance);
await host.PersistOnceAsync(CancellationToken.None);
var row = await _readCtx.DriverInstanceResilienceStatuses.SingleAsync();
row.BaselineFootprintBytes.ShouldBe(100_000_000);
row.CurrentFootprintBytes.ShouldBe(150_000_000);
}
[Fact]
public async Task TickCount_Advances_OnEveryCall()
{
var tracker = new DriverResilienceStatusTracker();
var host = new ResilienceStatusPublisherHostedService(
tracker, _factory, NullLogger<ResilienceStatusPublisherHostedService>.Instance);
await host.PersistOnceAsync(CancellationToken.None);
await host.PersistOnceAsync(CancellationToken.None);
await host.PersistOnceAsync(CancellationToken.None);
host.TickCount.ShouldBe(3);
}
}