diff --git a/src/ZB.MOM.WW.ScadaBridge.Host/Health/ActiveNodeHealthCheck.cs b/src/ZB.MOM.WW.ScadaBridge.Host/Health/ActiveNodeHealthCheck.cs deleted file mode 100644 index 6e887e3c..00000000 --- a/src/ZB.MOM.WW.ScadaBridge.Host/Health/ActiveNodeHealthCheck.cs +++ /dev/null @@ -1,45 +0,0 @@ -using Akka.Cluster; -using Microsoft.Extensions.Diagnostics.HealthChecks; -using ZB.MOM.WW.ScadaBridge.Host.Actors; - -namespace ZB.MOM.WW.ScadaBridge.Host.Health; - -/// -/// Health check that returns healthy only if this node is the active (leader) node -/// in the Akka.NET cluster. Used by Traefik to route traffic to the active node. -/// -public class ActiveNodeHealthCheck : IHealthCheck -{ - private readonly AkkaHostedService _akkaService; - - /// Initializes a new with the given Akka hosted service. - /// The Akka hosted service providing access to the actor system and cluster state. - public ActiveNodeHealthCheck(AkkaHostedService akkaService) - { - _akkaService = akkaService; - } - - /// Returns healthy if this node is the cluster leader (active node); otherwise returns unhealthy. - /// Health check context providing registration details. - /// Cancellation token. - public Task CheckHealthAsync( - HealthCheckContext context, - CancellationToken cancellationToken = default) - { - var system = _akkaService.ActorSystem; - if (system == null) - return Task.FromResult(HealthCheckResult.Unhealthy("ActorSystem not yet available.")); - - var cluster = Cluster.Get(system); - var self = cluster.SelfMember; - - if (self.Status != MemberStatus.Up) - return Task.FromResult(HealthCheckResult.Unhealthy($"Node not Up (status: {self.Status}).")); - - var leader = cluster.State.Leader; - if (leader != null && leader == self.Address) - return Task.FromResult(HealthCheckResult.Healthy("Active node (cluster leader).")); - - return Task.FromResult(HealthCheckResult.Unhealthy("Standby node (not cluster leader).")); - } -} diff --git a/src/ZB.MOM.WW.ScadaBridge.Host/Health/AkkaClusterHealthCheck.cs b/src/ZB.MOM.WW.ScadaBridge.Host/Health/AkkaClusterHealthCheck.cs deleted file mode 100644 index 4356c7dd..00000000 --- a/src/ZB.MOM.WW.ScadaBridge.Host/Health/AkkaClusterHealthCheck.cs +++ /dev/null @@ -1,52 +0,0 @@ -using Akka.Cluster; -using Microsoft.Extensions.Diagnostics.HealthChecks; -using ZB.MOM.WW.ScadaBridge.Host.Actors; - -namespace ZB.MOM.WW.ScadaBridge.Host.Health; - -/// -/// Health check that verifies this node is an active member of the Akka.NET cluster. -/// Returns healthy only if the node's self-member status is Up or Joining. -/// -public class AkkaClusterHealthCheck : IHealthCheck -{ - private readonly AkkaHostedService _akkaService; - - /// - /// Initializes the health check with the Akka hosted service. - /// - /// The hosted service providing access to the Akka actor system. - public AkkaClusterHealthCheck(AkkaHostedService akkaService) - { - _akkaService = akkaService; - } - - /// - /// Checks that this node is an active member of the Akka.NET cluster. - /// - /// Health check context. - /// Cancellation token. - public Task CheckHealthAsync( - HealthCheckContext context, - CancellationToken cancellationToken = default) - { - var system = _akkaService.ActorSystem; - if (system == null) - return Task.FromResult(HealthCheckResult.Degraded("ActorSystem not yet available.")); - - var cluster = Cluster.Get(system); - var status = cluster.SelfMember.Status; - - var result = status switch - { - MemberStatus.Up or MemberStatus.Joining => - HealthCheckResult.Healthy($"Akka cluster member status: {status}"), - MemberStatus.Leaving or MemberStatus.Exiting => - HealthCheckResult.Degraded($"Akka cluster member status: {status}"), - _ => - HealthCheckResult.Unhealthy($"Akka cluster member status: {status}") - }; - - return Task.FromResult(result); - } -} diff --git a/src/ZB.MOM.WW.ScadaBridge.Host/Health/DatabaseHealthCheck.cs b/src/ZB.MOM.WW.ScadaBridge.Host/Health/DatabaseHealthCheck.cs deleted file mode 100644 index beb3c3cb..00000000 --- a/src/ZB.MOM.WW.ScadaBridge.Host/Health/DatabaseHealthCheck.cs +++ /dev/null @@ -1,43 +0,0 @@ -using Microsoft.Extensions.Diagnostics.HealthChecks; -using ZB.MOM.WW.ScadaBridge.ConfigurationDatabase; - -namespace ZB.MOM.WW.ScadaBridge.Host.Health; - -/// -/// Health check that verifies database connectivity for Central nodes. -/// -public class DatabaseHealthCheck : IHealthCheck -{ - private readonly ScadaBridgeDbContext _dbContext; - - /// - /// Initializes a new . - /// - /// The EF Core database context used to test connectivity. - public DatabaseHealthCheck(ScadaBridgeDbContext dbContext) - { - _dbContext = dbContext; - } - - /// - /// Checks database connectivity by attempting to open a connection. - /// - /// Health check context providing failure status information. - /// Cancellation token for the check. - public async Task CheckHealthAsync( - HealthCheckContext context, - CancellationToken cancellationToken = default) - { - try - { - var canConnect = await _dbContext.Database.CanConnectAsync(cancellationToken); - return canConnect - ? HealthCheckResult.Healthy("Database connection is available.") - : HealthCheckResult.Unhealthy("Database connection failed."); - } - catch (Exception ex) - { - return HealthCheckResult.Unhealthy("Database connection failed.", ex); - } - } -} diff --git a/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs b/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs index b1146c5f..fddb5f1d 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs @@ -1,5 +1,6 @@ -using HealthChecks.UI.Client; -using Microsoft.AspNetCore.Diagnostics.HealthChecks; +using ZB.MOM.WW.Health; +using ZB.MOM.WW.Health.Akka; +using ZB.MOM.WW.Health.EntityFrameworkCore; using ZB.MOM.WW.ScadaBridge.AuditLog; using ZB.MOM.WW.ScadaBridge.CentralUI; using ZB.MOM.WW.ScadaBridge.ClusterInfrastructure; @@ -110,11 +111,25 @@ try ?? throw new InvalidOperationException("ScadaBridge:Database:ConfigurationDb connection string is required for Central role."); builder.Services.AddConfigurationDatabase(configDbConnectionString); - // WP-12: Health checks for readiness gating + // WP-12: Health checks for readiness gating — shared ZB.MOM.WW.Health probes. + // Check names and the ready/active tier split are preserved: database + akka-cluster + // carry the Ready tag (/health/ready), active-node carries the Active tag (/health/active). + // The Akka checks resolve ActorSystem from DI via the transient bridge registered below; + // the DatabaseHealthCheck resolves a scoped ScadaBridgeDbContext (no factory). builder.Services.AddHealthChecks() - .AddCheck("database") - .AddCheck("akka-cluster") - .AddCheck("active-node"); + .AddTypeActivatedCheck>( + "database", + failureStatus: null, + tags: new[] { ZbHealthTags.Ready }) + .AddTypeActivatedCheck( + "akka-cluster", + failureStatus: null, + tags: new[] { ZbHealthTags.Ready }, + args: AkkaClusterStatusPolicy.Default) + .AddTypeActivatedCheck( + "active-node", + failureStatus: null, + tags: new[] { ZbHealthTags.Active }); // WP-13: Akka.NET bootstrap via hosted service builder.Services.AddSingleton(); @@ -221,23 +236,17 @@ try && HttpMethods.IsPost(ctx.Request.Method), branch => branch.UseAuditWriteMiddleware()); - // WP-12: Map readiness endpoint — returns 503 until ready, 200 when ready. - // REQ-HOST-4a defines readiness as cluster membership + DB connectivity, - // explicitly NOT cluster leadership. The leader-only "active-node" check is - // excluded here so a fully operational standby central node reports ready; - // leadership is reported separately on /health/active. - app.MapHealthChecks("/health/ready", new HealthCheckOptions - { - Predicate = check => check.Name != "active-node", - ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse - }); - - // Active node endpoint — returns 200 only on the cluster leader; used by Traefik for routing - app.MapHealthChecks("/health/active", new HealthCheckOptions - { - Predicate = check => check.Name == "active-node", - ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse - }); + // WP-12: Map the canonical three-tier health endpoints in one call: + // /health/ready — Ready-tagged checks (database + akka-cluster). REQ-HOST-4a defines + // readiness as cluster membership + DB connectivity, explicitly NOT + // cluster leadership, so the leader-only active-node check is excluded + // (a fully operational standby central node still reports ready). + // /health/active — Active-tagged check (active-node); returns 200 only on the cluster + // leader; used by Traefik for routing. + // /healthz — bare process liveness; runs no checks (always 200 while the process + // is up). New tier added by adopting the shared library. + // All three are anonymous and use the canonical ZbHealthWriter JSON output. + app.MapZbHealth(); app.MapStaticAssets(); app.MapCentralUI(); diff --git a/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/HealthCheckTests.cs b/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/HealthCheckTests.cs index 7f0cec38..49754337 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/HealthCheckTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/HealthCheckTests.cs @@ -1,11 +1,20 @@ +using System.Linq; using Microsoft.AspNetCore.Mvc.Testing; using Microsoft.Extensions.Configuration; -using ZB.MOM.WW.ScadaBridge.Host.Health; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Diagnostics.HealthChecks; +using Microsoft.Extensions.Options; +using ZB.MOM.WW.Health; namespace ZB.MOM.WW.ScadaBridge.Host.Tests; /// -/// WP-12: Tests for /health/ready and /health/active endpoints. +/// WP-12: Tests for the three-tier health endpoints after adopting the shared +/// ZB.MOM.WW.Health probes. Verifies that /health/ready, /health/active and the new +/// /healthz tier are mapped, and that the readiness/active tier split is now carried by +/// the canonical (Ready for database + akka-cluster, Active for +/// active-node) rather than by check-name predicates. These are pure route/tag assertions +/// — they require no database, LDAP, or formed Akka cluster. /// public class HealthCheckTests : IDisposable { @@ -25,41 +34,49 @@ public class HealthCheckTests : IDisposable } } + private WebApplicationFactory CreateCentralFactory() + { + var factory = new WebApplicationFactory() + .WithWebHostBuilder(builder => + { + builder.ConfigureAppConfiguration((context, config) => + { + config.AddInMemoryCollection(new Dictionary + { + ["ScadaBridge:Node:NodeHostname"] = "localhost", + ["ScadaBridge:Node:RemotingPort"] = "0", + ["ScadaBridge:Cluster:SeedNodes:0"] = "akka.tcp://scadabridge@localhost:2551", + ["ScadaBridge:Cluster:SeedNodes:1"] = "akka.tcp://scadabridge@localhost:2552", + ["ScadaBridge:Database:SkipMigrations"] = "true", + }); + }); + builder.UseSetting("ScadaBridge:Node:Role", "Central"); + builder.UseSetting("ScadaBridge:Database:SkipMigrations", "true"); + }); + _disposables.Add(factory); + return factory; + } + + private static IEnumerable Registrations(WebApplicationFactory factory) => + factory.Services.GetRequiredService>().Value.Registrations; + [Fact] - public async Task HealthReady_Endpoint_ReturnsResponse() + public async Task HealthReady_Endpoint_IsMapped() { var previousEnv = Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT"); try { Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", "Central"); - - var factory = new WebApplicationFactory() - .WithWebHostBuilder(builder => - { - builder.ConfigureAppConfiguration((context, config) => - { - config.AddInMemoryCollection(new Dictionary - { - ["ScadaBridge:Node:NodeHostname"] = "localhost", - ["ScadaBridge:Node:RemotingPort"] = "0", - ["ScadaBridge:Cluster:SeedNodes:0"] = "akka.tcp://scadabridge@localhost:2551", - ["ScadaBridge:Cluster:SeedNodes:1"] = "akka.tcp://scadabridge@localhost:2552", - ["ScadaBridge:Database:SkipMigrations"] = "true", - }); - }); - builder.UseSetting("ScadaBridge:Node:Role", "Central"); - builder.UseSetting("ScadaBridge:Database:SkipMigrations", "true"); - }); - _disposables.Add(factory); - + var factory = CreateCentralFactory(); var client = factory.CreateClient(); _disposables.Add(client); var response = await client.GetAsync("/health/ready"); - // The endpoint exists and returns a status code. - // With test infrastructure (no real DB), the database check may fail, - // so we accept either 200 (Healthy) or 503 (Unhealthy). + // The endpoint exists and returns a status code. With test infrastructure + // (no real DB / cluster) the readiness checks may report Unhealthy, so we + // accept either 200 (Healthy/Degraded) or 503 (Unhealthy) — never 404. + Assert.NotEqual(System.Net.HttpStatusCode.NotFound, response.StatusCode); Assert.True( response.StatusCode == System.Net.HttpStatusCode.OK || response.StatusCode == System.Net.HttpStatusCode.ServiceUnavailable, @@ -72,39 +89,19 @@ public class HealthCheckTests : IDisposable } [Fact] - public async Task HealthActive_Endpoint_ReturnsResponse() + public async Task HealthActive_Endpoint_IsMapped() { var previousEnv = Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT"); try { Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", "Central"); - - var factory = new WebApplicationFactory() - .WithWebHostBuilder(builder => - { - builder.ConfigureAppConfiguration((context, config) => - { - config.AddInMemoryCollection(new Dictionary - { - ["ScadaBridge:Node:NodeHostname"] = "localhost", - ["ScadaBridge:Node:RemotingPort"] = "0", - ["ScadaBridge:Cluster:SeedNodes:0"] = "akka.tcp://scadabridge@localhost:2551", - ["ScadaBridge:Cluster:SeedNodes:1"] = "akka.tcp://scadabridge@localhost:2552", - ["ScadaBridge:Database:SkipMigrations"] = "true", - }); - }); - builder.UseSetting("ScadaBridge:Node:Role", "Central"); - builder.UseSetting("ScadaBridge:Database:SkipMigrations", "true"); - }); - _disposables.Add(factory); - + var factory = CreateCentralFactory(); var client = factory.CreateClient(); _disposables.Add(client); var response = await client.GetAsync("/health/active"); - // In test mode, the ActorSystem may not be fully available, - // so the active-node check returns 503 (Unhealthy). + Assert.NotEqual(System.Net.HttpStatusCode.NotFound, response.StatusCode); Assert.True( response.StatusCode == System.Net.HttpStatusCode.OK || response.StatusCode == System.Net.HttpStatusCode.ServiceUnavailable, @@ -117,46 +114,21 @@ public class HealthCheckTests : IDisposable } [Fact] - public async Task HealthReady_Endpoint_ExcludesActiveNodeCheck() + public async Task Healthz_LivenessEndpoint_IsMappedAndReturns200() { - // Host-001 regression: /health/ready must reflect cluster membership + DB - // connectivity only (REQ-HOST-4a), NOT cluster leadership. The leader-only - // "active-node" check belongs solely to /health/active. If /health/ready - // included "active-node", a fully operational standby central node would - // permanently report 503, breaking load-balancer failover readiness. + // New tier added by adopting the shared library: /healthz runs no checks, so it + // returns 200 as long as the process is up — independent of DB / cluster state. var previousEnv = Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT"); try { Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", "Central"); - - var factory = new WebApplicationFactory() - .WithWebHostBuilder(builder => - { - builder.ConfigureAppConfiguration((context, config) => - { - config.AddInMemoryCollection(new Dictionary - { - ["ScadaBridge:Node:NodeHostname"] = "localhost", - ["ScadaBridge:Node:RemotingPort"] = "0", - ["ScadaBridge:Cluster:SeedNodes:0"] = "akka.tcp://scadabridge@localhost:2551", - ["ScadaBridge:Cluster:SeedNodes:1"] = "akka.tcp://scadabridge@localhost:2552", - ["ScadaBridge:Database:SkipMigrations"] = "true", - }); - }); - builder.UseSetting("ScadaBridge:Node:Role", "Central"); - builder.UseSetting("ScadaBridge:Database:SkipMigrations", "true"); - }); - _disposables.Add(factory); - + var factory = CreateCentralFactory(); var client = factory.CreateClient(); _disposables.Add(client); - var response = await client.GetAsync("/health/ready"); - var body = await response.Content.ReadAsStringAsync(); + var response = await client.GetAsync("/healthz"); - // The readiness body lists each executed check by name in its entries map. - // The leader-only "active-node" check must not be among them. - Assert.DoesNotContain("active-node", body); + Assert.Equal(System.Net.HttpStatusCode.OK, response.StatusCode); } finally { @@ -165,43 +137,54 @@ public class HealthCheckTests : IDisposable } [Fact] - public async Task ActiveNodeHealthCheck_SystemNotStarted_ReturnsUnhealthy() + public void ReadyTier_Carries_Database_And_AkkaCluster() { - // AkkaHostedService before StartAsync has ActorSystem == null. - // The integration test (HealthActive_Endpoint_ReturnsResponse) validates the full - // endpoint wiring. This test validates the null-system path via WebApplicationFactory - // where the ActorSystem may not be available. + // Host-001 regression guard: readiness reflects cluster membership + DB connectivity + // only (REQ-HOST-4a), NOT cluster leadership. The split is now carried by the Ready tag + // rather than a check-name predicate: database + akka-cluster are Ready-tagged, and the + // leader-only active-node check is NOT — so a fully operational standby central node + // still reports ready on /health/ready. var previousEnv = Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT"); try { Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", "Central"); - var factory = new WebApplicationFactory() - .WithWebHostBuilder(builder => - { - builder.ConfigureAppConfiguration((context, config) => - { - config.AddInMemoryCollection(new Dictionary - { - ["ScadaBridge:Node:NodeHostname"] = "localhost", - ["ScadaBridge:Node:RemotingPort"] = "0", - ["ScadaBridge:Cluster:SeedNodes:0"] = "akka.tcp://scadabridge@localhost:2551", - ["ScadaBridge:Database:SkipMigrations"] = "true", - }); - }); - builder.UseSetting("ScadaBridge:Node:Role", "Central"); - builder.UseSetting("ScadaBridge:Database:SkipMigrations", "true"); - }); - _disposables.Add(factory); + var factory = CreateCentralFactory(); - var client = factory.CreateClient(); - _disposables.Add(client); + var registrations = Registrations(factory).ToDictionary(r => r.Name); - var response = await client.GetAsync("/health/active"); - var body = await response.Content.ReadAsStringAsync(); + Assert.True(registrations.ContainsKey("database"), "Expected a 'database' health check."); + Assert.True(registrations.ContainsKey("akka-cluster"), "Expected an 'akka-cluster' health check."); - // Active-node check returns 503 when ActorSystem is not yet available or not leader - Assert.Equal(System.Net.HttpStatusCode.ServiceUnavailable, response.StatusCode); - Assert.Contains("active-node", body); + Assert.Contains(ZbHealthTags.Ready, registrations["database"].Tags); + Assert.Contains(ZbHealthTags.Ready, registrations["akka-cluster"].Tags); + + // The leader-only active-node check must NOT be on the readiness tier. + Assert.DoesNotContain(ZbHealthTags.Ready, registrations["active-node"].Tags); + } + finally + { + Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", previousEnv); + } + } + + [Fact] + public void ActiveTier_Carries_Only_ActiveNode() + { + // The active-node leader check carries the Active tag (→ /health/active); the readiness + // checks do not, so /health/active reports leadership alone. + var previousEnv = Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT"); + try + { + Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", "Central"); + var factory = CreateCentralFactory(); + + var registrations = Registrations(factory).ToDictionary(r => r.Name); + + Assert.True(registrations.ContainsKey("active-node"), "Expected an 'active-node' health check."); + Assert.Contains(ZbHealthTags.Active, registrations["active-node"].Tags); + + Assert.DoesNotContain(ZbHealthTags.Active, registrations["database"].Tags); + Assert.DoesNotContain(ZbHealthTags.Active, registrations["akka-cluster"].Tags); } finally {