From 253bec5a52ddbb3e673e27246c8bdd5995be39eb Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Tue, 16 Jun 2026 06:48:52 -0400 Subject: [PATCH] feat(host): readiness gates on required cluster singletons (#28, M2.14) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit REQ-HOST-4a lists "required cluster singletons running (if applicable)" as a readiness criterion, but /health/ready only checked database + akka-cluster. Add a third Ready-tagged check, RequiredSingletonsHealthCheck, registered in the Central-role AddHealthChecks() chain (so it is naturally role-scoped — site nodes never run it). Probe: for each required central singleton, Ask its local ClusterSingletonProxy an Identify with a short bounded per-singleton timeout (~2s, probes run concurrently via Task.WhenAll). A non-null ActorIdentity.Subject within the timeout means the singleton is running and reachable through the proxy; a null subject or a timeout means unreachable → Unhealthy, naming the unreachable singleton(s). The check never throws (catch-all → Unhealthy) and resolves ActorSystem lazily from DI per probe (Unhealthy if Akka not yet up). Required-always set = the five singleton proxies created unconditionally in AkkaHostedService.RegisterCentralActors: notification-outbox, audit-log-ingest, site-call-audit, audit-log-purge, site-audit-reconciliation. There are no feature/config-gated central singletons today; any future gated singleton is the "if applicable" case and must NOT be added to the required set. Leadership-agnostic: the proxy reaches the singleton from either central node, so a ready standby still reports ready (readiness must not require cluster leadership — that is the Active tier's job). During a brief singleton handover the probe may time out and the node flaps to not-ready, which is correct (a node mid-handover is legitimately not fully ready); no retries, to keep the probe fast. Tests (TDD): RequiredSingletonsHealthCheckTests exercises the probe against a TestKit ActorSystem — all proxies present+reachable → Healthy; one missing → Unhealthy naming it; ActorSystem absent → Unhealthy, no throw. HealthCheckTests regression-guards the Ready tag + absence of the Active tag on the new check. --- ...illpending-m2-implementation.md.tasks.json | 4 +- docs/requirements/Component-Host.md | 2 + .../Health/RequiredSingletonsHealthCheck.cs | 177 ++++++++++++++++++ src/ZB.MOM.WW.ScadaBridge.Host/Program.cs | 12 ++ .../HealthCheckTests.cs | 9 + .../RequiredSingletonsHealthCheckTests.cs | 109 +++++++++++ 6 files changed, 311 insertions(+), 2 deletions(-) create mode 100644 src/ZB.MOM.WW.ScadaBridge.Host/Health/RequiredSingletonsHealthCheck.cs create mode 100644 tests/ZB.MOM.WW.ScadaBridge.Host.Tests/RequiredSingletonsHealthCheckTests.cs diff --git a/docs/plans/2026-06-15-stillpending-m2-implementation.md.tasks.json b/docs/plans/2026-06-15-stillpending-m2-implementation.md.tasks.json index b61da6d3..71d89631 100644 --- a/docs/plans/2026-06-15-stillpending-m2-implementation.md.tasks.json +++ b/docs/plans/2026-06-15-stillpending-m2-implementation.md.tasks.json @@ -14,8 +14,8 @@ {"id": 42, "ref": "M2.10", "subject": "M2.10 #18: CI grep-guard against UPDATE/DELETE on AuditLog", "class": "small", "status": "completed", "commits": ["e7b6fe3", "9cd62aa"]}, {"id": 43, "ref": "M2.11", "subject": "M2.11 #24: debug snapshot unknown-instance returns error", "class": "small", "status": "completed", "commits": ["dbf44b9", "d160c7f"]}, {"id": 44, "ref": "M2.12", "subject": "M2.12 #25: recursion-limit error to site event log", "class": "small", "status": "completed", "commits": ["f08038d", "e2b31a9"]}, - {"id": 45, "ref": "M2.13", "subject": "M2.13 #27: populate obtainable OPC UA/MxGateway transition fields", "class": "small", "status": "pending"}, - {"id": 46, "ref": "M2.14", "subject": "M2.14 #28: readiness gate checks required cluster singletons", "class": "standard", "status": "pending"}, + {"id": 45, "ref": "M2.13", "subject": "M2.13 #27: populate obtainable OPC UA/MxGateway transition fields", "class": "small", "status": "completed", "commits": ["722b866", "3945789"]}, + {"id": 46, "ref": "M2.14", "subject": "M2.14 #28: readiness gate checks required cluster singletons", "class": "standard", "status": "completed", "commits": ["a4d81fa"]}, {"id": 47, "ref": "M2.15", "subject": "M2.15 #29: register site active-node purge gate (DI)", "class": "small", "status": "pending"}, {"id": 48, "ref": "M2.16", "subject": "M2.16 #30: Health Monitoring consumes FailedWriteCount", "class": "small", "status": "pending"}, {"id": 49, "ref": "M2.17", "subject": "M2.17 #31: reconcile StateTransitionValidator delete-from-NotDeployed", "class": "small", "status": "pending"}, diff --git a/docs/requirements/Component-Host.md b/docs/requirements/Component-Host.md index 85f4fd91..17e56706 100644 --- a/docs/requirements/Component-Host.md +++ b/docs/requirements/Component-Host.md @@ -95,6 +95,8 @@ On central nodes, the ASP.NET Core web endpoints (Central UI, Inbound API) must - Database connectivity (MS SQL) is verified. - Required cluster singletons are running (if applicable). +These are implemented as three `Ready`-tagged health checks registered in the Central-role branch of `Program.cs` (so they are naturally role-scoped — site nodes do not run them): `database` (`DatabaseHealthCheck`), `akka-cluster` (`AkkaClusterHealthCheck`), and `required-singletons` (`RequiredSingletonsHealthCheck`). The last verifies each *required-always* central singleton is reachable by Asking its local `ClusterSingletonProxy` an `Identify` with a short bounded timeout (~2s, probes run concurrently) and treating a non-null `ActorIdentity.Subject` as reachable; any unreachable required singleton degrades the check to **Unhealthy**, naming it. The required-always set is the five unconditional central singletons: notification-outbox, audit-log-ingest, site-call-audit, audit-log-purge, and site-audit-reconciliation. Feature-gated singletons are the "if applicable" case and are not probed when their feature is off. The check is leadership-agnostic — the proxy reaches the singleton from either central node, so a ready standby still reports ready (readiness must NOT require cluster leadership; that is the `Active` tier's job). During a brief singleton handover the probe may momentarily time out and the node may flap to not-ready, which is correct: a node mid-handover is legitimately not fully ready (no retries are used, to keep readiness polling fast). + A standard ASP.NET Core health check endpoint (`/health/ready`) reports readiness status. The load balancer uses this endpoint to determine when to route traffic to the node. During startup or failover, the node returns `503 Service Unavailable` until ready. ### REQ-HOST-5: Windows Service Hosting diff --git a/src/ZB.MOM.WW.ScadaBridge.Host/Health/RequiredSingletonsHealthCheck.cs b/src/ZB.MOM.WW.ScadaBridge.Host/Health/RequiredSingletonsHealthCheck.cs new file mode 100644 index 00000000..b3e84441 --- /dev/null +++ b/src/ZB.MOM.WW.ScadaBridge.Host/Health/RequiredSingletonsHealthCheck.cs @@ -0,0 +1,177 @@ +using Akka.Actor; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Diagnostics.HealthChecks; +using Microsoft.Extensions.Logging; + +namespace ZB.MOM.WW.ScadaBridge.Host.Health; + +/// +/// M2.14 (#28): readiness check that verifies every required central cluster +/// singleton is reachable from this node, satisfying the "required cluster +/// singletons running (if applicable)" clause of REQ-HOST-4a. Register it +/// -tagged in the Central-role +/// AddHealthChecks() chain only, so it is naturally role-scoped (site nodes +/// never register it). +/// +/// +/// +/// Probe strategy. Each central singleton has a local +/// ClusterSingletonProxy actor (created unconditionally in +/// AkkaHostedService.RegisterCentralActors). The proxy actor exists locally +/// as soon as it is created, so merely resolving its path proves nothing about the +/// singleton itself. Instead we +/// the proxy an with a short bounded per-singleton timeout and +/// expect an whose is +/// non-null. The proxy buffers and forwards to the live singleton, so a non-null +/// Subject within the timeout means the singleton is running and reachable; a null +/// Subject or a timeout means it is unreachable. Probes run concurrently +/// () so the +/// whole check stays cheap and readiness polling stays fast. +/// +/// +/// Required-always vs if-applicable. All five central singleton proxies are +/// created unconditionally on a central node (there is no feature/config gate around +/// any of them), so all five are treated as required-always here. If a future +/// singleton is created behind a feature flag, it should NOT be added to +/// — "if applicable" means skip when its +/// feature is off. +/// +/// +/// Failover flakiness. During a brief singleton handover the singleton may be +/// momentarily unreachable through the proxy. The bounded per-singleton timeout maps +/// that to Unhealthy (we never throw and never retry — retries would make the probe +/// slow). Readiness flapping briefly during a failover is acceptable and correct: a +/// node mid-handover is legitimately not fully ready. We deliberately accept that +/// tradeoff rather than masking it with retries. +/// +/// +/// No leadership requirement. The proxy reaches the singleton from either node +/// (active or standby), so a ready standby still reports Healthy here — readiness must +/// NOT require cluster leadership (that is the Active tier's job). +/// +/// +/// The is resolved lazily from DI per probe, mirroring +/// AkkaClusterHealthCheck; if it is not yet available (startup race) the check +/// returns Unhealthy rather than throwing. +/// +/// +public sealed class RequiredSingletonsHealthCheck : IHealthCheck +{ + /// + /// Local actor names (under /user) of the ClusterSingletonProxy + /// actors for the singletons that must always be running on a central node. + /// Matches the unconditional proxy registrations in + /// AkkaHostedService.RegisterCentralActors. + /// + public static readonly IReadOnlyList RequiredSingletonProxyNames = new[] + { + "notification-outbox-proxy", + "audit-log-ingest-proxy", + "site-call-audit-proxy", + "audit-log-purge-proxy", + "site-audit-reconciliation-proxy", + }; + + // Short, bounded per-singleton timeout. Kept small so readiness polling stays + // fast; a singleton in mid-handover that does not answer within this window is + // (correctly) treated as momentarily unreachable. Do NOT add retries here. + private static readonly TimeSpan ProbeTimeout = TimeSpan.FromSeconds(2); + + private readonly IServiceProvider _serviceProvider; + private readonly ILogger _logger; + + /// Initializes a new . + /// + /// Application service provider; the is resolved lazily so the + /// check is startup-safe (Unhealthy, never throwing, if Akka is not yet up). + /// + /// Logger for diagnostic detail on unreachable singletons. + public RequiredSingletonsHealthCheck( + IServiceProvider serviceProvider, + ILogger logger) + { + _serviceProvider = serviceProvider ?? throw new ArgumentNullException(nameof(serviceProvider)); + _logger = logger ?? throw new ArgumentNullException(nameof(logger)); + } + + /// + public async Task CheckHealthAsync( + HealthCheckContext context, + CancellationToken cancellationToken = default) + { + // CheckHealthAsync must NEVER throw — catch everything and map to Unhealthy + // with a descriptive message. An escaping exception would be recorded as + // Unhealthy anyway, but a thrown exception loses the descriptive message. + try + { + var system = _serviceProvider.GetService(); + if (system is null) + return HealthCheckResult.Unhealthy("ActorSystem not yet available."); + + // Probe each required singleton concurrently so the whole check is bounded + // by ~ProbeTimeout, not the sum of the per-singleton timeouts. + var probes = RequiredSingletonProxyNames + .Select(name => ProbeAsync(system, name, cancellationToken)) + .ToArray(); + + var results = await Task.WhenAll(probes).ConfigureAwait(false); + + var unreachable = results + .Where(r => !r.Reachable) + .Select(r => r.Name) + .ToList(); + + if (unreachable.Count == 0) + return HealthCheckResult.Healthy( + $"All {RequiredSingletonProxyNames.Count} required cluster singletons are reachable."); + + var joined = string.Join(", ", unreachable); + _logger.LogWarning( + "Readiness degraded: required cluster singleton(s) unreachable: {Unreachable}", + joined); + return HealthCheckResult.Unhealthy( + $"Required cluster singleton(s) unreachable: {joined}."); + } + catch (Exception ex) + { + // Defensive: any unexpected failure (including OperationCanceledException + // on shutdown) degrades readiness rather than escaping the check. + return HealthCheckResult.Unhealthy( + "Failed to probe required cluster singletons.", ex); + } + } + + /// + /// Asks the named local proxy an with a bounded timeout. + /// Reachable iff a non-null comes back in time. + /// A null Subject (path not present) or a timeout/exception → not reachable. This + /// method itself never throws. + /// + private async Task<(string Name, bool Reachable)> ProbeAsync( + ActorSystem system, + string proxyName, + CancellationToken cancellationToken) + { + try + { + // ActorSelection so a missing path resolves an ActorIdentity with a null + // Subject (rather than throwing) within the bounded timeout. + var selection = system.ActorSelection($"/user/{proxyName}"); + using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); + cts.CancelAfter(ProbeTimeout); + + var identity = await selection + .Ask(new Identify(proxyName), ProbeTimeout, cts.Token) + .ConfigureAwait(false); + + return (proxyName, identity.Subject is not null); + } + catch (Exception) + { + // Timeout / cancellation / any failure → momentarily unreachable. Bounded, + // no retry — readiness may briefly flap during a singleton handover, which + // is the correct signal for a node mid-handover. + return (proxyName, false); + } + } +} diff --git a/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs b/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs index 5af0f3b5..022e296a 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs @@ -202,6 +202,18 @@ try failureStatus: null, tags: new[] { ZbHealthTags.Ready }, args: AkkaClusterStatusPolicy.Default) + // M2.14 (#28): readiness ALSO reflects "required cluster singletons running" + // (REQ-HOST-4a). Probes each central singleton's local ClusterSingletonProxy + // with a bounded Identify and degrades to Unhealthy if any required singleton + // is unreachable. Registered inside the Central-role branch (this is it) so the + // check is naturally role-scoped — site nodes never run it. It resolves + // ActorSystem from DI per probe, like the akka-cluster check above, and is + // leadership-agnostic so a ready standby still reports ready (the proxy reaches + // the singleton from either node). + .AddTypeActivatedCheck( + "required-singletons", + failureStatus: null, + tags: new[] { ZbHealthTags.Ready }) .AddTypeActivatedCheck( "active-node", failureStatus: null, diff --git a/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/HealthCheckTests.cs b/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/HealthCheckTests.cs index 49754337..547b39ea 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/HealthCheckTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/HealthCheckTests.cs @@ -158,6 +158,15 @@ public class HealthCheckTests : IDisposable Assert.Contains(ZbHealthTags.Ready, registrations["database"].Tags); Assert.Contains(ZbHealthTags.Ready, registrations["akka-cluster"].Tags); + // M2.14 (#28): readiness ALSO reflects "required cluster singletons running" + // (REQ-HOST-4a). The Central-only required-singletons check is Ready-tagged so + // it gates /health/ready alongside database + akka-cluster, but is leadership- + // agnostic (it does NOT carry the Active tag), so a ready standby stays ready. + Assert.True(registrations.ContainsKey("required-singletons"), + "Expected a 'required-singletons' health check."); + Assert.Contains(ZbHealthTags.Ready, registrations["required-singletons"].Tags); + Assert.DoesNotContain(ZbHealthTags.Active, registrations["required-singletons"].Tags); + // The leader-only active-node check must NOT be on the readiness tier. Assert.DoesNotContain(ZbHealthTags.Ready, registrations["active-node"].Tags); } diff --git a/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/RequiredSingletonsHealthCheckTests.cs b/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/RequiredSingletonsHealthCheckTests.cs new file mode 100644 index 00000000..fd6c281d --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/RequiredSingletonsHealthCheckTests.cs @@ -0,0 +1,109 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Diagnostics.HealthChecks; +using Microsoft.Extensions.Logging.Abstractions; +using ZB.MOM.WW.ScadaBridge.Host.Health; + +namespace ZB.MOM.WW.ScadaBridge.Host.Tests; + +/// +/// M2.14 (#28): unit tests for . +/// +/// The check probes each required central singleton through its local +/// ClusterSingletonProxy by Asking an with a short +/// bounded timeout and treating a non-null as +/// "reachable". These tests exercise that probe logic directly against a TestKit +/// : +/// +/// present + reachable proxy paths (live echo actors) → Healthy; +/// a missing proxy path (ActorSelection resolves a null Subject) → Unhealthy +/// naming the unreachable singleton. +/// +/// No WebApplicationFactory / DB / formed cluster is needed — the probe is just an +/// in-process Identify round-trip, so the tests are deterministic and fast. +/// +public class RequiredSingletonsHealthCheckTests : TestKit +{ + /// A minimal live actor that does nothing — its mere existence makes + /// an resolve a non-null Subject (i.e. "reachable"). + private sealed class EchoActor : ReceiveActor + { + } + + private IServiceProvider ProviderReturning(ActorSystem system) + { + var services = new ServiceCollection(); + services.AddSingleton(system); + return services.BuildServiceProvider(); + } + + private static async Task RunAsync(RequiredSingletonsHealthCheck check) + { + var context = new HealthCheckContext + { + Registration = new HealthCheckRegistration( + "required-singletons", check, failureStatus: null, tags: null), + }; + return await check.CheckHealthAsync(context, CancellationToken.None); + } + + [Fact] + public async Task AllRequiredSingletonProxiesReachable_ReportsHealthy() + { + // Create a live actor at every required proxy path so each Identify resolves + // a non-null Subject. + foreach (var name in RequiredSingletonsHealthCheck.RequiredSingletonProxyNames) + { + Sys.ActorOf(Props.Create(() => new EchoActor()), name); + } + + var check = new RequiredSingletonsHealthCheck( + ProviderReturning(Sys), + NullLogger.Instance); + + var result = await RunAsync(check); + + Assert.Equal(HealthStatus.Healthy, result.Status); + } + + [Fact] + public async Task OneRequiredSingletonUnreachable_ReportsUnhealthyNamingIt() + { + // Create all but one proxy. The missing one's ActorSelection resolves an + // ActorIdentity with a null Subject within the bounded timeout → unreachable. + var missing = RequiredSingletonsHealthCheck.RequiredSingletonProxyNames[0]; + foreach (var name in RequiredSingletonsHealthCheck.RequiredSingletonProxyNames) + { + if (name == missing) + continue; + Sys.ActorOf(Props.Create(() => new EchoActor()), name); + } + + var check = new RequiredSingletonsHealthCheck( + ProviderReturning(Sys), + NullLogger.Instance); + + var result = await RunAsync(check); + + Assert.Equal(HealthStatus.Unhealthy, result.Status); + Assert.NotNull(result.Description); + Assert.Contains(missing, result.Description!); + } + + [Fact] + public async Task ActorSystemNotYetAvailable_ReportsUnhealthy_DoesNotThrow() + { + // Startup race: ActorSystem not yet bridged into DI. The check must map this + // to Unhealthy (the node is not ready to serve) rather than throwing. + var emptyProvider = new ServiceCollection().BuildServiceProvider(); + + var check = new RequiredSingletonsHealthCheck( + emptyProvider, + NullLogger.Instance); + + var result = await RunAsync(check); + + Assert.Equal(HealthStatus.Unhealthy, result.Status); + } +}