feat(host): readiness gates on required cluster singletons (#28, M2.14)
REQ-HOST-4a lists "required cluster singletons running (if applicable)" as a readiness criterion, but /health/ready only checked database + akka-cluster. Add a third Ready-tagged check, RequiredSingletonsHealthCheck, registered in the Central-role AddHealthChecks() chain (so it is naturally role-scoped — site nodes never run it). Probe: for each required central singleton, Ask its local ClusterSingletonProxy an Identify with a short bounded per-singleton timeout (~2s, probes run concurrently via Task.WhenAll). A non-null ActorIdentity.Subject within the timeout means the singleton is running and reachable through the proxy; a null subject or a timeout means unreachable → Unhealthy, naming the unreachable singleton(s). The check never throws (catch-all → Unhealthy) and resolves ActorSystem lazily from DI per probe (Unhealthy if Akka not yet up). Required-always set = the five singleton proxies created unconditionally in AkkaHostedService.RegisterCentralActors: notification-outbox, audit-log-ingest, site-call-audit, audit-log-purge, site-audit-reconciliation. There are no feature/config-gated central singletons today; any future gated singleton is the "if applicable" case and must NOT be added to the required set. Leadership-agnostic: the proxy reaches the singleton from either central node, so a ready standby still reports ready (readiness must not require cluster leadership — that is the Active tier's job). During a brief singleton handover the probe may time out and the node flaps to not-ready, which is correct (a node mid-handover is legitimately not fully ready); no retries, to keep the probe fast. Tests (TDD): RequiredSingletonsHealthCheckTests exercises the probe against a TestKit ActorSystem — all proxies present+reachable → Healthy; one missing → Unhealthy naming it; ActorSystem absent → Unhealthy, no throw. HealthCheckTests regression-guards the Ready tag + absence of the Active tag on the new check.
This commit is contained in:
@@ -158,6 +158,15 @@ public class HealthCheckTests : IDisposable
|
||||
Assert.Contains(ZbHealthTags.Ready, registrations["database"].Tags);
|
||||
Assert.Contains(ZbHealthTags.Ready, registrations["akka-cluster"].Tags);
|
||||
|
||||
// M2.14 (#28): readiness ALSO reflects "required cluster singletons running"
|
||||
// (REQ-HOST-4a). The Central-only required-singletons check is Ready-tagged so
|
||||
// it gates /health/ready alongside database + akka-cluster, but is leadership-
|
||||
// agnostic (it does NOT carry the Active tag), so a ready standby stays ready.
|
||||
Assert.True(registrations.ContainsKey("required-singletons"),
|
||||
"Expected a 'required-singletons' health check.");
|
||||
Assert.Contains(ZbHealthTags.Ready, registrations["required-singletons"].Tags);
|
||||
Assert.DoesNotContain(ZbHealthTags.Active, registrations["required-singletons"].Tags);
|
||||
|
||||
// The leader-only active-node check must NOT be on the readiness tier.
|
||||
Assert.DoesNotContain(ZbHealthTags.Ready, registrations["active-node"].Tags);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,109 @@
|
||||
using Akka.Actor;
|
||||
using Akka.TestKit.Xunit2;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Diagnostics.HealthChecks;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using ZB.MOM.WW.ScadaBridge.Host.Health;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host.Tests;
|
||||
|
||||
/// <summary>
/// M2.14 (#28): unit tests for <see cref="RequiredSingletonsHealthCheck"/>.
///
/// The check probes each required central singleton through its local
/// <c>ClusterSingletonProxy</c> by Asking an <see cref="Identify"/> with a short
/// bounded timeout and treating a non-null <see cref="ActorIdentity.Subject"/> as
/// "reachable". These tests exercise that probe logic directly against a TestKit
/// <see cref="ActorSystem"/>:
/// <list type="bullet">
///   <item>present + reachable proxy paths (live stand-in actors) → Healthy;</item>
///   <item>a missing proxy path (ActorSelection resolves a null Subject) → Unhealthy
///   naming the unreachable singleton;</item>
///   <item>no <see cref="ActorSystem"/> registered in DI (startup race) → Unhealthy,
///   without throwing.</item>
/// </list>
/// No WebApplicationFactory / DB / formed cluster is needed — the probe is just an
/// in-process Identify round-trip, so the tests are deterministic and fast.
/// </summary>
public class RequiredSingletonsHealthCheckTests : TestKit
{
    /// <summary>A minimal live actor that does nothing — its mere existence makes
    /// an <see cref="Identify"/> resolve a non-null Subject (i.e. "reachable").</summary>
    private sealed class EchoActor : ReceiveActor
    {
    }

    /// <summary>
    /// Builds a DI container whose only registration is <paramref name="system"/>.
    /// </summary>
    /// <param name="system">The actor system the check should resolve from DI.</param>
    /// <returns>
    /// A disposable provider; callers own it and must dispose it. The actor system is
    /// registered as a pre-built instance, so disposing the provider does not shut it
    /// down — TestKit keeps ownership of <c>Sys</c>.
    /// </returns>
    private static ServiceProvider ProviderReturning(ActorSystem system)
    {
        var services = new ServiceCollection();
        services.AddSingleton(system);
        return services.BuildServiceProvider();
    }

    /// <summary>
    /// Invokes <paramref name="check"/> once under a "required-singletons" registration
    /// with no failure-status override, no tags and no cancellation.
    /// </summary>
    /// <param name="check">The health check under test.</param>
    /// <returns>The result reported by the check.</returns>
    private static async Task<HealthCheckResult> RunAsync(RequiredSingletonsHealthCheck check)
    {
        var context = new HealthCheckContext
        {
            Registration = new HealthCheckRegistration(
                "required-singletons", check, failureStatus: null, tags: null),
        };

        return await check.CheckHealthAsync(context, CancellationToken.None);
    }

    [Fact]
    public async Task AllRequiredSingletonProxiesReachable_ReportsHealthy()
    {
        // Create a live actor at every required proxy path so each Identify resolves
        // a non-null Subject.
        foreach (var name in RequiredSingletonsHealthCheck.RequiredSingletonProxyNames)
        {
            Sys.ActorOf(Props.Create(() => new EchoActor()), name);
        }

        // Dispose the per-test container once the probe has completed.
        using var provider = ProviderReturning(Sys);
        var check = new RequiredSingletonsHealthCheck(
            provider,
            NullLogger<RequiredSingletonsHealthCheck>.Instance);

        var result = await RunAsync(check);

        Assert.Equal(HealthStatus.Healthy, result.Status);
    }

    [Fact]
    public async Task OneRequiredSingletonUnreachable_ReportsUnhealthyNamingIt()
    {
        // Create all but one proxy. The missing one's ActorSelection resolves an
        // ActorIdentity with a null Subject within the bounded timeout → unreachable.
        var missing = RequiredSingletonsHealthCheck.RequiredSingletonProxyNames[0];
        foreach (var name in RequiredSingletonsHealthCheck.RequiredSingletonProxyNames)
        {
            if (name == missing)
            {
                continue;
            }

            Sys.ActorOf(Props.Create(() => new EchoActor()), name);
        }

        // Dispose the per-test container once the probe has completed.
        using var provider = ProviderReturning(Sys);
        var check = new RequiredSingletonsHealthCheck(
            provider,
            NullLogger<RequiredSingletonsHealthCheck>.Instance);

        var result = await RunAsync(check);

        Assert.Equal(HealthStatus.Unhealthy, result.Status);
        Assert.NotNull(result.Description);
        Assert.Contains(missing, result.Description!);
    }

    [Fact]
    public async Task ActorSystemNotYetAvailable_ReportsUnhealthy_DoesNotThrow()
    {
        // Startup race: ActorSystem not yet bridged into DI. The check must map this
        // to Unhealthy (the node is not ready to serve) rather than throwing.
        using var emptyProvider = new ServiceCollection().BuildServiceProvider();

        var check = new RequiredSingletonsHealthCheck(
            emptyProvider,
            NullLogger<RequiredSingletonsHealthCheck>.Instance);

        var result = await RunAsync(check);

        Assert.Equal(HealthStatus.Unhealthy, result.Status);
    }
}
|
||||
Reference in New Issue
Block a user