Phase 6.3 A.2 + D.1 — GenerationRefreshHostedService: poll + lease-wrap apply

Closes tasks #132 + #118 (GA hardening backlog).

Before this commit, the Server only observed the generation in force at
process start (SealedBootstrap). Peer-published generations accumulated
in the shared config DB while the running node kept serving the
generation it had sealed on boot. Two consequences:

1. Operator role-swaps required a process restart: the Admin published a
   new generation, but the Server's RedundancyCoordinator never re-read
   the topology.
2. ApplyLeaseRegistry had no apply to wrap. ServiceLevelBand sat at
   PrimaryHealthy (255) during every publish because nothing opened a
   lease; PrimaryMidApply (200) was effectively dead code.

New GenerationRefreshHostedService (src/.../Server/Hosting/):
- Polls sp_GetCurrentGenerationForCluster every 5s (tunable).
- On change: opens a lease via leases.BeginApplyLease(newGenerationId,
  Guid.NewGuid()), calls coordinator.RefreshAsync inside the `await using`,
  and releases on scope exit (success, exception, or cancellation, via
  IAsyncDisposable). See the band sketch after this list.
- Diagnostic properties: LastAppliedGenerationId, TickCount, RefreshCount.
- Delegate-injected currentGenerationQuery for test drive-through; real
  path is the private static DefaultQueryCurrentGenerationAsync.
- Registered as HostedService in Program.cs alongside the Phase 6.3
  redundancy / peer-probe stack.
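
For orientation, a minimal model of the band interplay the service relies on.
This is a sketch, not the shipped types: the real ApplyLeaseRegistry and
ServiceLevelCalculator live in Server/Redundancy, and everything here beyond
the band values (200/255) and the IsApplyInProgress flag is assumed:

```csharp
using System.Threading;

// Illustrative model only; the real ApplyLeaseRegistry / ServiceLevelCalculator
// live in ZB.MOM.WW.OtOpcUa.Server.Redundancy and their shapes may differ.
public enum ServiceLevelBand : byte
{
    PrimaryMidApply = 200, // an apply lease is currently open
    PrimaryHealthy = 255,  // steady state
}

public sealed class ApplyLeaseModel
{
    private int _openLeases;

    public bool IsApplyInProgress => Volatile.Read(ref _openLeases) > 0;

    public IAsyncDisposable BeginApplyLease(long generationId, Guid publishRequestId)
    {
        Interlocked.Increment(ref _openLeases);
        return new Lease(this);
    }

    private sealed class Lease(ApplyLeaseModel owner) : IAsyncDisposable
    {
        // DisposeAsync fires on every exit path of `await using` (success,
        // exception, cancellation), so a crashed refresh cannot pin the
        // band at PrimaryMidApply.
        public ValueTask DisposeAsync()
        {
            Interlocked.Decrement(ref owner._openLeases);
            return ValueTask.CompletedTask;
        }
    }
}

public static class ServiceLevelModel
{
    // Publisher ticks read this and push the band onto Server.ServiceLevel.
    public static ServiceLevelBand Calculate(ApplyLeaseModel leases) =>
        leases.IsApplyInProgress ? ServiceLevelBand.PrimaryMidApply
                                 : ServiceLevelBand.PrimaryHealthy;
}
```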

Scope intentionally narrow: only the coordinator refreshes today. Driver
re-init, virtual-tag re-bind, and script-engine reload remain as follow-up
wiring. The lease wrap is the right seam for those subscribers to hook once
they grow hot-reload support (the doc comments say so); one plausible shape
for that seam is sketched below.
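
Hypothetical only, to make the seam concrete. Neither the interface name nor
any implementation of it exists in this commit:

```csharp
// Hypothetical subscriber contract for the follow-up wiring; the name and
// shape are invented here, not part of this commit.
public interface IGenerationApplySubscriber
{
    Task OnGenerationAppliedAsync(long generationId, CancellationToken ct);
}
```

Subscribers would be awaited inside the same `await using` lease block as
coordinator.RefreshAsync, so the entire apply (refresh + fan-out) reports
PrimaryMidApply until the last subscriber finishes.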

Tests
- 5 new unit tests in GenerationRefreshHostedServiceTests (first-apply,
  identity no-op, change-triggers-refresh, null-generation-is-no-op,
  lease-is-released-on-exit). Stub generation-query delegate; real
  coordinator backed by an EF InMemory DB. Condensed sketch below.
- Server.Tests total 252 → 257.
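
Roughly what the change-triggers-refresh case looks like. CreateOptions() and
CreateCoordinator() are hypothetical stand-ins for the suite's EF-InMemory
fixtures, and the parameterless ApplyLeaseRegistry constructor is assumed:

```csharp
using Microsoft.Extensions.Logging.Abstractions;
using Xunit;

public class GenerationRefreshSketchTests
{
    // CreateOptions() / CreateCoordinator() are hypothetical fixture helpers.
    [Fact]
    public async Task Tick_DetectsChange_AndRefreshesInsideLease()
    {
        long? published = 41;
        var leases = new ApplyLeaseRegistry();   // assumed parameterless
        var sut = new GenerationRefreshHostedService(
            CreateOptions(),
            leases,
            CreateCoordinator(),
            NullLogger<GenerationRefreshHostedService>.Instance,
            currentGenerationQuery: _ => Task.FromResult(published));

        await sut.TickAsync(CancellationToken.None);   // first apply
        Assert.Equal(41L, sut.LastAppliedGenerationId);
        Assert.Equal(1, sut.RefreshCount);

        await sut.TickAsync(CancellationToken.None);   // identity no-op
        Assert.Equal(1, sut.RefreshCount);

        published = 42;                                // peer publish lands
        await sut.TickAsync(CancellationToken.None);   // change triggers refresh
        Assert.Equal(42L, sut.LastAppliedGenerationId);
        Assert.Equal(2, sut.RefreshCount);
        Assert.False(leases.IsApplyInProgress);        // lease released on exit
    }
}
```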

Docs
- In v2-release-readiness.md, the Phase 6.3 follow-ups list now shows the
  sp_PublishGeneration lease-wrap bullet struck through, with a close-out
  note.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Author: Joseph Doherty
Date:   2026-04-24 15:02:33 -04:00
parent de77d42eab
commit a23de2a7e4
4 changed files with 317 additions and 1 deletions

src/.../Server/Hosting/GenerationRefreshHostedService.cs (new file)

@@ -0,0 +1,160 @@
using Microsoft.Data.SqlClient;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;

using ZB.MOM.WW.OtOpcUa.Server.Redundancy;

namespace ZB.MOM.WW.OtOpcUa.Server.Hosting;

/// <summary>
/// Phase 6.3 A.2 + Phase 6.1 Stream D follow-up — polls
/// <c>sp_GetCurrentGenerationForCluster</c> on a cadence and, when a newer generation
/// is detected, wraps the apply in an <see cref="ApplyLeaseRegistry"/> lease
/// (flipping ServiceLevel to <see cref="ServiceLevelBand.PrimaryMidApply"/>) and
/// refreshes the <see cref="RedundancyCoordinator"/> so operator role-swaps take
/// effect without a process restart.
/// </summary>
/// <remarks>
/// <para>
/// Before this service shipped, the Server only ever saw the generation in force
/// at process start (<see cref="SealedBootstrap"/>). Peer-published generations
/// silently accumulated in the shared config DB; the running node kept serving
/// the generation it had sealed on boot until the operator restarted it.
/// </para>
/// <para>
/// Closes the Phase 6.3 D.1 design hole around <c>PrimaryMidApply</c>: the
/// <c>leases.BeginApplyLease(...)</c> wrap now encloses an actual apply
/// (the coordinator refresh + future subscriber fan-out). Lease dispose fires
/// on every exit path — success, exception, cancellation — so
/// <c>ApplyLeaseRegistry</c> can never pin a crashed refresh at
/// PrimaryMidApply.
/// </para>
/// <para>
/// Deliberately narrow scope: refreshes <see cref="RedundancyCoordinator"/>
/// only. Driver re-init, virtual-tag re-bind, script-engine reload, etc. remain
/// as follow-up wiring — add subscribers to this service's apply path as those
/// components grow hot-reload support. The lease wrap is the right seam for
/// those subscribers to hook.
/// </para>
/// </remarks>
public sealed class GenerationRefreshHostedService(
    NodeOptions options,
    ApplyLeaseRegistry leases,
    RedundancyCoordinator coordinator,
    ILogger<GenerationRefreshHostedService> logger,
    TimeSpan? tickInterval = null,
    Func<CancellationToken, Task<long?>>? currentGenerationQuery = null) : BackgroundService
{
    private readonly Func<CancellationToken, Task<long?>> _generationQuery = currentGenerationQuery
        ?? new Func<CancellationToken, Task<long?>>(ct => DefaultQueryCurrentGenerationAsync(options, logger, ct));

    /// <summary>
    /// How often the service polls <c>sp_GetCurrentGenerationForCluster</c>. Default 5 s —
    /// low enough that operator publishes take effect promptly, high enough that the
    /// overhead on the central DB is negligible even across a 100-node fleet.
    /// </summary>
    public TimeSpan TickInterval { get; } = tickInterval ?? TimeSpan.FromSeconds(5);

    /// <summary>
    /// Newest generation the service has applied. Exposed for diagnostics and
    /// <see cref="TickCount"/>-style health surfaces. <c>null</c> before the first
    /// successful poll.
    /// </summary>
    public long? LastAppliedGenerationId { get; private set; }

    /// <summary>Successful ticks — whether or not a generation change was detected.</summary>
    public int TickCount { get; private set; }

    /// <summary>Ticks that observed a generation change and ran a refresh.</summary>
    public int RefreshCount { get; private set; }

    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        logger.LogInformation(
            "GenerationRefreshHostedService running — polling every {Tick}s",
            TickInterval.TotalSeconds);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await TickAsync(stoppingToken).ConfigureAwait(false);
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                logger.LogWarning(ex, "GenerationRefreshHostedService tick failed");
            }

            try { await Task.Delay(TickInterval, stoppingToken).ConfigureAwait(false); }
            catch (OperationCanceledException) { break; }
        }
    }

    // Internal for tests — single-tick entry point.
    internal async Task TickAsync(CancellationToken cancellationToken)
    {
        var current = await _generationQuery(cancellationToken).ConfigureAwait(false);
        TickCount++;
        if (current is null) return;
        if (LastAppliedGenerationId is long last && current == last)
        {
            return; // no change
        }

        logger.LogInformation(
            "Generation change detected — {Previous} → {Current}; applying",
            LastAppliedGenerationId?.ToString() ?? "(none)", current);

        // Lease wraps the apply window: ServiceLevelCalculator reads
        // ApplyLeaseRegistry.IsApplyInProgress and returns PrimaryMidApply (200) while any
        // lease is open. Publisher ticks in parallel (1s cadence) will observe the band
        // transition and push it onto the OPC UA Server.ServiceLevel node.
        var publishRequestId = Guid.NewGuid();
        await using (leases.BeginApplyLease(current.Value, publishRequestId))
        {
            await coordinator.RefreshAsync(cancellationToken).ConfigureAwait(false);
            // Future: fire a domain event that driver hosts / virtual-tag engine /
            // scripted-alarm engine subscribe to. For now the topology refresh is the
            // only thing we rewire — everything else still requires a process restart.
        }

        LastAppliedGenerationId = current;
        RefreshCount++;
    }

    /// <summary>
    /// Default generation-query implementation — reads via
    /// <c>sp_GetCurrentGenerationForCluster</c>. Returns <c>null</c> when no generation
    /// has been published yet, or when the DB call fails (logged at Warning; the next
    /// tick retries). Tests inject a stub delegate through the
    /// <c>currentGenerationQuery</c> constructor parameter instead.
    /// </summary>
    private static async Task<long?> DefaultQueryCurrentGenerationAsync(
        NodeOptions options,
        ILogger logger,
        CancellationToken cancellationToken)
    {
        try
        {
            await using var conn = new SqlConnection(options.ConfigDbConnectionString);
            await conn.OpenAsync(cancellationToken).ConfigureAwait(false);

            await using var cmd = conn.CreateCommand();
            cmd.CommandText = "EXEC dbo.sp_GetCurrentGenerationForCluster @NodeId=@n, @ClusterId=@c";
            cmd.Parameters.AddWithValue("@n", options.NodeId);
            cmd.Parameters.AddWithValue("@c", options.ClusterId);

            await using var reader = await cmd.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
            if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
            {
                return null;
            }

            // A NULL generation column is treated the same as "nothing published yet".
            return reader.IsDBNull(0) ? null : reader.GetInt64(0);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            logger.LogWarning(ex, "sp_GetCurrentGenerationForCluster failed — will retry");
            return null;
        }
    }
}

Program.cs

@@ -173,6 +173,12 @@ builder.Services.AddHttpClient(PeerHttpProbeLoop.HttpClientName);
builder.Services.AddHostedService<PeerHttpProbeLoop>();
builder.Services.AddHostedService<PeerUaProbeLoop>();
// Phase 6.3 A.2 + 6.1 Stream D — periodic generation refresh. Detects peer-published
// generations, opens an ApplyLeaseRegistry lease during the refresh window (so the
// publisher surfaces PrimaryMidApply=200 instead of sitting at PrimaryHealthy=255
// through the apply), and calls coordinator.RefreshAsync to pick up topology changes.
builder.Services.AddHostedService<GenerationRefreshHostedService>();
// Phase 7 follow-up #246 — historian sink + engine composer. NullAlarmHistorianSink
// is the default until the Galaxy.Host SqliteStoreAndForwardSink writer adapter
// lands (task #248). The composer reads Script/VirtualTag/ScriptedAlarm rows on