fix(server, admin): wire sp_RegisterNodeGenerationApplied + overlay heartbeat onto ClusterNode

dbo.sp_RegisterNodeGenerationApplied was defined by the initial
StoredProcedures migration but had zero callers in src/. The server
polled sp_GetCurrentGenerationForCluster every 5s but never reported
back, so dbo.ClusterNodeGenerationState stayed empty for every node
and both the Admin UI Fleet status page ("No node state recorded")
and the cluster-detail Redundancy LastSeenAt indicator ("never
STALE") showed broken liveness forever.

Server side (GenerationRefreshHostedService):
* New testable seam: Func<long, NodeApplyStatus, string?, CT, Task>?
  registerAppliedAsync constructor parameter, defaulting to a real
  sp_RegisterNodeGenerationApplied call against the central DB.
* TickAsync now calls the proc at two points: after every successful
  apply with NodeApplyStatus.Applied, and on every no-change tick as
  a heartbeat (also Applied) so LastSeenAt stays fresh.
* The apply path (lease + coordinator.RefreshAsync) is now wrapped in a
  try/catch: a failure reports NodeApplyStatus.Failed with the exception
  message, and LastAppliedGenerationId advances regardless of outcome so
  we don't loop on the same broken apply every 5s.
* Register-call failures are best-effort (LogDebug for heartbeats,
  LogWarning for apply reports) — a transient DB outage during reporting
  must not crash the publisher or block the next apply.

Admin side (ClusterNodeService.ListByClusterAsync): the Redundancy tab
reads ClusterNode.LastSeenAt, but no current writer maintains that
column — the heartbeat goes to ClusterNodeGenerationState.LastSeenAt.
Overlay the GenerationState heartbeat onto the returned ClusterNode
rows when more recent, so IsStale + the Redundancy table column
reflect actual liveness without a schema change or new write path.

Tests: 3 new cases on GenerationRefreshHostedServiceTests verify
first-apply reports Applied, no-change ticks heartbeat with Applied,
and register-call failure does not roll back the cursor or block
subsequent ticks. All 8 GenerationRefresh tests pass.

Verified live on node-dev-a / cluster-dev: dbo.ClusterNodeGenerationState
now populated with CurrentGenerationId=1, LastAppliedStatus=Applied,
fresh LastSeenAt. Fleet status page shows the node (KPIs NODES 1 /
APPLIED 1 / STALE 0 / FAILED 0). Redundancy tab KPI STALE went 1→0 and
the row shows a real LAST SEEN timestamp. Bonus: FleetStatusHub
SignalR push now fires the cluster-page Live update banner on every
heartbeat because there are finally state changes to push.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-25 02:22:59 -04:00
parent c8de58d6d3
commit f23e368a74
3 changed files with 187 additions and 11 deletions

View File

@@ -110,6 +110,66 @@ public sealed class GenerationRefreshHostedServiceTests : IDisposable
leases.OpenLeaseCount.ShouldBe(0, "IAsyncDisposable dispose must fire regardless of outcome");
}
// Bug #12 fix — verifies the previously-missing wiring: applies and heartbeats both
// emit sp_RegisterNodeGenerationApplied so Admin UI Fleet status + Redundancy LastSeenAt
// surface live state.
[Fact]
public async Task First_apply_reports_Applied_status_to_central_db()
{
    // Arrange: the recorder list captures every register callback the service makes.
    var seededCoordinator = await SeedCoordinatorAsync();
    var leaseRegistry = new ApplyLeaseRegistry();
    var recorded = new List<(long Gen, NodeApplyStatus Status, string? Error)>();
    var sut = NewService(seededCoordinator, leaseRegistry, currentGeneration: () => 42, registerCalls: recorded);

    // Act: a single tick against a generation the service has not applied yet.
    await sut.TickAsync(CancellationToken.None);

    // Assert: one report, carrying the applied generation, Applied status, and no error text.
    recorded.Count.ShouldBe(1, "exactly one register call per apply window");
    var (reportedGen, reportedStatus, reportedError) = recorded[0];
    reportedGen.ShouldBe(42);
    reportedStatus.ShouldBe(NodeApplyStatus.Applied);
    reportedError.ShouldBeNull();
}
[Fact]
public async Task No_change_tick_heartbeats_with_Applied_status()
{
    // Arrange: the generation query always answers 42, so only the first tick is a real apply.
    var seededCoordinator = await SeedCoordinatorAsync();
    var leaseRegistry = new ApplyLeaseRegistry();
    var recorded = new List<(long Gen, NodeApplyStatus Status, string? Error)>();
    var sut = NewService(seededCoordinator, leaseRegistry, currentGeneration: () => 42, registerCalls: recorded);

    // Act: tick 1 is the initial apply; ticks 2 and 3 see no change and should only heartbeat.
    const int totalTicks = 3;
    for (var tick = 0; tick < totalTicks; tick++)
    {
        await sut.TickAsync(CancellationToken.None);
    }

    // Assert: every tick produced a register call, all for generation 42 with Applied status.
    recorded.Count.ShouldBe(3, "one apply call + two heartbeat calls");
    recorded.ShouldAllBe(c => c.Gen == 42 && c.Status == NodeApplyStatus.Applied && c.Error == null);
}
[Fact]
public async Task Register_call_failure_does_not_break_apply_or_block_subsequent_ticks()
{
    // Arrange: build the service directly so the register callback can be made to throw.
    var seededCoordinator = await SeedCoordinatorAsync();
    var leaseRegistry = new ApplyLeaseRegistry();
    var nodeOptions = new NodeOptions { NodeId = "A", ClusterId = "c1", ConfigDbConnectionString = "unused" };
    var registerCallCount = 0;

    var sut = new GenerationRefreshHostedService(
        nodeOptions,
        leaseRegistry,
        seededCoordinator,
        NullLogger<GenerationRefreshHostedService>.Instance,
        tickInterval: TimeSpan.FromSeconds(1),
        currentGenerationQuery: _ => Task.FromResult<long?>(42),
        registerAppliedAsync: (_, _, _, _) =>
        {
            // Count the attempt, then fail synchronously to mimic the DB being unreachable.
            registerCallCount += 1;
            throw new InvalidOperationException("simulated DB outage during register");
        });

    // Act: the first tick applies and then fails to report; the second tick's heartbeat fails too.
    // Neither await may surface the register exception.
    await sut.TickAsync(CancellationToken.None);
    await sut.TickAsync(CancellationToken.None);

    // Assert: both register attempts ran and the applied-generation cursor kept its value.
    registerCallCount.ShouldBe(2, "both register attempts must run");
    sut.LastAppliedGenerationId.ShouldBe(42, "register failure must not roll back the cursor");
}
// ---- fixture helpers ---------------------------------------------------
private async Task<RedundancyCoordinator> SeedCoordinatorAsync()
@@ -136,11 +196,15 @@ public sealed class GenerationRefreshHostedServiceTests : IDisposable
// Fixture factory for the service under test. Uses fixed node options (NodeId "A",
// ClusterId "c1", an unused connection string), a null logger and a 1-second tick interval.
// The generation query returns whatever currentGeneration() yields, wrapped in a completed task.
// When registerCalls is supplied, each register callback appends (gen, status, err) to it;
// when it is null the register callback is a no-op that completes immediately.
// NOTE(review): this span looks like an unmarked diff hunk — the line ending
// "currentGeneration) =>" and the line ending "currentGeneration()));" appear to be the
// removed pre-change versions of the lines that follow them. Confirm against the real file.
private static GenerationRefreshHostedService NewService(
RedundancyCoordinator coordinator,
ApplyLeaseRegistry leases,
Func<long?> currentGeneration) =>
Func<long?> currentGeneration,
List<(long Gen, NodeApplyStatus Status, string? Error)>? registerCalls = null) =>
new(new NodeOptions { NodeId = "A", ClusterId = "c1", ConfigDbConnectionString = "unused" },
leases, coordinator, NullLogger<GenerationRefreshHostedService>.Instance,
tickInterval: TimeSpan.FromSeconds(1),
currentGenerationQuery: _ => Task.FromResult(currentGeneration()));
currentGenerationQuery: _ => Task.FromResult(currentGeneration()),
registerAppliedAsync: registerCalls is null
? (_, _, _, _) => Task.CompletedTask
: (gen, status, err, _) => { registerCalls.Add((gen, status, err)); return Task.CompletedTask; });
private sealed class DbContextFactory(DbContextOptions<OtOpcUaConfigDbContext> options)
: IDbContextFactory<OtOpcUaConfigDbContext>