From 016122841b577daff25385acce01f1f53587bad6 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sun, 19 Apr 2026 14:36:00 -0400 Subject: [PATCH] =?UTF-8?q?Phase=206.1=20Stream=20E.2=20partial=20?= =?UTF-8?q?=E2=80=94=20ResilienceStatusPublisherHostedService=20persists?= =?UTF-8?q?=20tracker=20snapshots=20to=20DB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the HostedService half of Phase 6.1 Stream E.2 flagged as a follow-up when the DriverResilienceStatusTracker shipped in PR #82. The Admin /hosts column refresh + SignalR push + red-badge visual (Stream E.3) remain deferred to the visual-compliance pass — this PR owns the persistence story alone. Server.Hosting: - ResilienceStatusPublisherHostedService : BackgroundService. Samples the DriverResilienceStatusTracker every TickInterval (default 5 s) and upserts each (DriverInstanceId, HostName) counter pair into DriverInstanceResilienceStatus via EF. New rows on first sight; in-place updates on subsequent ticks. - PersistOnceAsync extracted public so tests drive one tick directly — matches the ScheduledRecycleHostedService pattern for deterministic timing. - Best-effort persistence: a DB outage logs a warning + continues; the next tick retries. Never crashes the app on sample failure. Cancellation propagates through cleanly. - Tracks the bulkhead depth / recycle / footprint columns the entity was designed for. CurrentBulkheadDepth currently persisted as 0 — the tracker doesn't yet expose live bulkhead depth; a narrower follow-up wires the Polly bulkhead-depth observer into the tracker. Tests (6 new in ResilienceStatusPublisherHostedServiceTests): - Empty tracker → tick is a no-op, zero rows written. - Single-host counters → upsert a new row with ConsecutiveFailures + breaker timestamp + sampled timestamp. - Second tick updates the existing row in place (not a second insert). - Multi-host pairs persist independently. - Footprint counters (Baseline + Current) round-trip. - TickCount advances on every PersistOnceAsync call. Full solution dotnet test: 1225 passing (was 1219, +6). Pre-existing Client.CLI Subscribe flake unchanged. Production wiring (Program.cs) example: builder.Services.AddSingleton(); builder.Services.AddHostedService(); // Tracker gets wired into CapabilityInvoker via OtOpcUaServer resolution // + the existing Phase 6.1 layer. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ResilienceStatusPublisherHostedService.cs | 138 +++++++++++++++ ...lienceStatusPublisherHostedServiceTests.cs | 161 ++++++++++++++++++ 2 files changed, 299 insertions(+) create mode 100644 src/ZB.MOM.WW.OtOpcUa.Server/Hosting/ResilienceStatusPublisherHostedService.cs create mode 100644 tests/ZB.MOM.WW.OtOpcUa.Server.Tests/ResilienceStatusPublisherHostedServiceTests.cs diff --git a/src/ZB.MOM.WW.OtOpcUa.Server/Hosting/ResilienceStatusPublisherHostedService.cs b/src/ZB.MOM.WW.OtOpcUa.Server/Hosting/ResilienceStatusPublisherHostedService.cs new file mode 100644 index 0000000..777be52 --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Server/Hosting/ResilienceStatusPublisherHostedService.cs @@ -0,0 +1,138 @@ +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; +using ZB.MOM.WW.OtOpcUa.Configuration; +using ZB.MOM.WW.OtOpcUa.Configuration.Entities; +using ZB.MOM.WW.OtOpcUa.Core.Resilience; + +namespace ZB.MOM.WW.OtOpcUa.Server.Hosting; + +/// +/// Samples at a fixed tick + upserts each +/// (DriverInstanceId, HostName) snapshot into +/// so Admin /hosts can render live resilience counters across restarts. +/// +/// +/// Closes the HostedService piece of Phase 6.1 Stream E.2 flagged as a follow-up +/// when the tracker shipped in PR #82. The Admin UI column-refresh piece (red badge when +/// ConsecutiveFailures > breakerThreshold / 2 + SignalR push) is still deferred to +/// the visual-compliance pass — this service owns the persistence half alone. +/// +/// Tick interval defaults to 5 s. Persistence is best-effort: a DB outage during +/// a tick logs + continues; the next tick tries again with the latest snapshots. The +/// hosted service never crashes the app on sample failure. +/// +/// factored as a public method so tests can drive +/// it directly, matching the +/// pattern for deterministic unit-test timing. +/// +public sealed class ResilienceStatusPublisherHostedService : BackgroundService +{ + private readonly DriverResilienceStatusTracker _tracker; + private readonly IDbContextFactory _dbContextFactory; + private readonly ILogger _logger; + private readonly TimeProvider _timeProvider; + + /// Tick interval — how often the tracker snapshot is persisted. + public TimeSpan TickInterval { get; } + + /// Snapshot of the tick count for diagnostics + test assertions. + public int TickCount { get; private set; } + + public ResilienceStatusPublisherHostedService( + DriverResilienceStatusTracker tracker, + IDbContextFactory dbContextFactory, + ILogger logger, + TimeProvider? timeProvider = null, + TimeSpan? tickInterval = null) + { + ArgumentNullException.ThrowIfNull(tracker); + ArgumentNullException.ThrowIfNull(dbContextFactory); + + _tracker = tracker; + _dbContextFactory = dbContextFactory; + _logger = logger; + _timeProvider = timeProvider ?? TimeProvider.System; + TickInterval = tickInterval ?? TimeSpan.FromSeconds(5); + } + + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + _logger.LogInformation( + "ResilienceStatusPublisherHostedService starting — tick interval = {Interval}", + TickInterval); + + while (!stoppingToken.IsCancellationRequested) + { + try + { + await Task.Delay(TickInterval, _timeProvider, stoppingToken).ConfigureAwait(false); + } + catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested) + { + break; + } + + await PersistOnceAsync(stoppingToken).ConfigureAwait(false); + } + + _logger.LogInformation("ResilienceStatusPublisherHostedService stopping after {TickCount} tick(s).", TickCount); + } + + /// + /// Take one snapshot of the tracker + upsert each pair into the persistence table. + /// Swallows transient exceptions + logs them; never throws from a sample failure. + /// + public async Task PersistOnceAsync(CancellationToken cancellationToken) + { + TickCount++; + var snapshot = _tracker.Snapshot(); + if (snapshot.Count == 0) return; + + try + { + await using var db = await _dbContextFactory.CreateDbContextAsync(cancellationToken).ConfigureAwait(false); + var now = _timeProvider.GetUtcNow().UtcDateTime; + + foreach (var (driverInstanceId, hostName, counters) in snapshot) + { + var existing = await db.DriverInstanceResilienceStatuses + .FirstOrDefaultAsync(x => x.DriverInstanceId == driverInstanceId && x.HostName == hostName, cancellationToken) + .ConfigureAwait(false); + + if (existing is null) + { + db.DriverInstanceResilienceStatuses.Add(new DriverInstanceResilienceStatus + { + DriverInstanceId = driverInstanceId, + HostName = hostName, + LastCircuitBreakerOpenUtc = counters.LastBreakerOpenUtc, + ConsecutiveFailures = counters.ConsecutiveFailures, + CurrentBulkheadDepth = 0, // Phase 6.1 Stream A tracker doesn't emit bulkhead depth yet + LastRecycleUtc = counters.LastRecycleUtc, + BaselineFootprintBytes = counters.BaselineFootprintBytes, + CurrentFootprintBytes = counters.CurrentFootprintBytes, + LastSampledUtc = now, + }); + } + else + { + existing.LastCircuitBreakerOpenUtc = counters.LastBreakerOpenUtc; + existing.ConsecutiveFailures = counters.ConsecutiveFailures; + existing.LastRecycleUtc = counters.LastRecycleUtc; + existing.BaselineFootprintBytes = counters.BaselineFootprintBytes; + existing.CurrentFootprintBytes = counters.CurrentFootprintBytes; + existing.LastSampledUtc = now; + } + } + + await db.SaveChangesAsync(cancellationToken).ConfigureAwait(false); + } + catch (OperationCanceledException) { throw; } + catch (Exception ex) + { + _logger.LogWarning(ex, + "ResilienceStatusPublisher persistence tick failed; next tick will retry with latest snapshots."); + } + } +} diff --git a/tests/ZB.MOM.WW.OtOpcUa.Server.Tests/ResilienceStatusPublisherHostedServiceTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Server.Tests/ResilienceStatusPublisherHostedServiceTests.cs new file mode 100644 index 0000000..9bbfc7f --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Server.Tests/ResilienceStatusPublisherHostedServiceTests.cs @@ -0,0 +1,161 @@ +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.Logging.Abstractions; +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Configuration; +using ZB.MOM.WW.OtOpcUa.Core.Resilience; +using ZB.MOM.WW.OtOpcUa.Server.Hosting; + +namespace ZB.MOM.WW.OtOpcUa.Server.Tests; + +[Trait("Category", "Unit")] +public sealed class ResilienceStatusPublisherHostedServiceTests : IDisposable +{ + private static readonly DateTime T0 = new(2026, 4, 19, 12, 0, 0, DateTimeKind.Utc); + + private sealed class FakeClock : TimeProvider + { + public DateTime Utc { get; set; } = T0; + public override DateTimeOffset GetUtcNow() => new(Utc, TimeSpan.Zero); + } + + private sealed class InMemoryDbContextFactory : IDbContextFactory + { + private readonly DbContextOptions _options; + public InMemoryDbContextFactory(string dbName) + { + _options = new DbContextOptionsBuilder() + .UseInMemoryDatabase(dbName) + .Options; + } + public OtOpcUaConfigDbContext CreateDbContext() => new(_options); + } + + private readonly string _dbName = $"resilience-pub-{Guid.NewGuid():N}"; + private readonly InMemoryDbContextFactory _factory; + private readonly OtOpcUaConfigDbContext _readCtx; + + public ResilienceStatusPublisherHostedServiceTests() + { + _factory = new InMemoryDbContextFactory(_dbName); + _readCtx = _factory.CreateDbContext(); + } + + public void Dispose() => _readCtx.Dispose(); + + [Fact] + public async Task EmptyTracker_Tick_NoOp_NoRowsWritten() + { + var tracker = new DriverResilienceStatusTracker(); + var host = new ResilienceStatusPublisherHostedService( + tracker, _factory, NullLogger.Instance); + + await host.PersistOnceAsync(CancellationToken.None); + + host.TickCount.ShouldBe(1); + (await _readCtx.DriverInstanceResilienceStatuses.CountAsync()).ShouldBe(0); + } + + [Fact] + public async Task SingleHost_OnePairWithCounters_UpsertsNewRow() + { + var clock = new FakeClock(); + var tracker = new DriverResilienceStatusTracker(); + tracker.RecordFailure("drv-1", "plc-a", T0); + tracker.RecordFailure("drv-1", "plc-a", T0); + tracker.RecordBreakerOpen("drv-1", "plc-a", T0.AddSeconds(1)); + + var host = new ResilienceStatusPublisherHostedService( + tracker, _factory, NullLogger.Instance, + timeProvider: clock); + + clock.Utc = T0.AddSeconds(2); + await host.PersistOnceAsync(CancellationToken.None); + + var row = await _readCtx.DriverInstanceResilienceStatuses.SingleAsync(); + row.DriverInstanceId.ShouldBe("drv-1"); + row.HostName.ShouldBe("plc-a"); + row.ConsecutiveFailures.ShouldBe(2); + row.LastCircuitBreakerOpenUtc.ShouldBe(T0.AddSeconds(1)); + row.LastSampledUtc.ShouldBe(T0.AddSeconds(2)); + } + + [Fact] + public async Task SecondTick_UpdatesExistingRow_InPlace() + { + var clock = new FakeClock(); + var tracker = new DriverResilienceStatusTracker(); + tracker.RecordFailure("drv-1", "plc-a", T0); + + var host = new ResilienceStatusPublisherHostedService( + tracker, _factory, NullLogger.Instance, + timeProvider: clock); + + clock.Utc = T0.AddSeconds(5); + await host.PersistOnceAsync(CancellationToken.None); + + // Second tick: success resets the counter. + tracker.RecordSuccess("drv-1", "plc-a", T0.AddSeconds(6)); + clock.Utc = T0.AddSeconds(10); + await host.PersistOnceAsync(CancellationToken.None); + + (await _readCtx.DriverInstanceResilienceStatuses.CountAsync()).ShouldBe(1, "one row, updated in place"); + var row = await _readCtx.DriverInstanceResilienceStatuses.SingleAsync(); + row.ConsecutiveFailures.ShouldBe(0); + row.LastSampledUtc.ShouldBe(T0.AddSeconds(10)); + } + + [Fact] + public async Task MultipleHosts_BothPersist_Independently() + { + var tracker = new DriverResilienceStatusTracker(); + tracker.RecordFailure("drv-1", "plc-a", T0); + tracker.RecordFailure("drv-1", "plc-a", T0); + tracker.RecordFailure("drv-1", "plc-b", T0); + + var host = new ResilienceStatusPublisherHostedService( + tracker, _factory, NullLogger.Instance); + + await host.PersistOnceAsync(CancellationToken.None); + + var rows = await _readCtx.DriverInstanceResilienceStatuses + .OrderBy(r => r.HostName) + .ToListAsync(); + rows.Count.ShouldBe(2); + rows[0].HostName.ShouldBe("plc-a"); + rows[0].ConsecutiveFailures.ShouldBe(2); + rows[1].HostName.ShouldBe("plc-b"); + rows[1].ConsecutiveFailures.ShouldBe(1); + } + + [Fact] + public async Task FootprintCounters_Persist() + { + var tracker = new DriverResilienceStatusTracker(); + tracker.RecordFootprint("drv-1", "plc-a", + baselineBytes: 100_000_000, currentBytes: 150_000_000, T0); + + var host = new ResilienceStatusPublisherHostedService( + tracker, _factory, NullLogger.Instance); + + await host.PersistOnceAsync(CancellationToken.None); + + var row = await _readCtx.DriverInstanceResilienceStatuses.SingleAsync(); + row.BaselineFootprintBytes.ShouldBe(100_000_000); + row.CurrentFootprintBytes.ShouldBe(150_000_000); + } + + [Fact] + public async Task TickCount_Advances_OnEveryCall() + { + var tracker = new DriverResilienceStatusTracker(); + var host = new ResilienceStatusPublisherHostedService( + tracker, _factory, NullLogger.Instance); + + await host.PersistOnceAsync(CancellationToken.None); + await host.PersistOnceAsync(CancellationToken.None); + await host.PersistOnceAsync(CancellationToken.None); + + host.TickCount.ShouldBe(3); + } +} -- 2.49.1