feat: adopt shared ZB.MOM.WW.Health probes; add /healthz; canonical writer

This commit is contained in:
Joseph Doherty
2026-06-01 13:46:49 -04:00
parent 2a7ff03718
commit bbff1d19b5
5 changed files with 124 additions and 272 deletions
@@ -1,11 +1,20 @@
using System.Linq;
using Microsoft.AspNetCore.Mvc.Testing;
using Microsoft.Extensions.Configuration;
using ZB.MOM.WW.ScadaBridge.Host.Health;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Diagnostics.HealthChecks;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.Health;
namespace ZB.MOM.WW.ScadaBridge.Host.Tests;
/// <summary>
/// WP-12: Tests for /health/ready and /health/active endpoints.
/// WP-12: Tests for the three-tier health endpoints after adopting the shared
/// ZB.MOM.WW.Health probes. Verifies that /health/ready, /health/active and the new
/// /healthz tier are mapped, and that the readiness/active tier split is now carried by
/// the canonical <see cref="ZbHealthTags"/> (Ready for database + akka-cluster, Active for
/// active-node) rather than by check-name predicates. These are pure route/tag assertions
/// — they require no database, LDAP, or formed Akka cluster.
/// </summary>
public class HealthCheckTests : IDisposable
{
@@ -25,41 +34,49 @@ public class HealthCheckTests : IDisposable
}
}
/// <summary>
/// Builds a <see cref="WebApplicationFactory{TEntryPoint}"/> for the host configured as a
/// "Central" node: node hostname/remoting port, two cluster seed nodes and the
/// skip-migrations flag are supplied through an in-memory configuration collection, and the
/// role and skip-migrations values are additionally applied as host settings.
/// </summary>
/// <returns>
/// The configured factory. It is also added to <c>_disposables</c> before being returned
/// (presumably so the test class can dispose it during teardown — the teardown code is not
/// part of this helper).
/// </returns>
private WebApplicationFactory<Program> CreateCentralFactory()
{
    // Settings layered into the application configuration for every test host.
    var centralSettings = new Dictionary<string, string?>
    {
        ["ScadaBridge:Node:NodeHostname"] = "localhost",
        ["ScadaBridge:Node:RemotingPort"] = "0",
        ["ScadaBridge:Cluster:SeedNodes:0"] = "akka.tcp://scadabridge@localhost:2551",
        ["ScadaBridge:Cluster:SeedNodes:1"] = "akka.tcp://scadabridge@localhost:2552",
        ["ScadaBridge:Database:SkipMigrations"] = "true",
    };

    var centralFactory = new WebApplicationFactory<Program>().WithWebHostBuilder(hostBuilder =>
        hostBuilder
            .ConfigureAppConfiguration((_, configuration) => configuration.AddInMemoryCollection(centralSettings))
            .UseSetting("ScadaBridge:Node:Role", "Central")
            .UseSetting("ScadaBridge:Database:SkipMigrations", "true"));

    _disposables.Add(centralFactory);
    return centralFactory;
}
/// <summary>
/// Resolves <see cref="HealthCheckServiceOptions"/> from the factory's service provider and
/// returns the health check registrations it holds.
/// </summary>
/// <param name="factory">The application factory whose services are queried.</param>
/// <returns>The registrations exposed by the resolved health check options.</returns>
private static IEnumerable<HealthCheckRegistration> Registrations(WebApplicationFactory<Program> factory)
{
    var healthCheckOptions = factory.Services.GetRequiredService<IOptions<HealthCheckServiceOptions>>();
    return healthCheckOptions.Value.Registrations;
}
[Fact]
public async Task HealthReady_Endpoint_ReturnsResponse()
public async Task HealthReady_Endpoint_IsMapped()
{
var previousEnv = Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT");
try
{
Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", "Central");
var factory = new WebApplicationFactory<Program>()
.WithWebHostBuilder(builder =>
{
builder.ConfigureAppConfiguration((context, config) =>
{
config.AddInMemoryCollection(new Dictionary<string, string?>
{
["ScadaBridge:Node:NodeHostname"] = "localhost",
["ScadaBridge:Node:RemotingPort"] = "0",
["ScadaBridge:Cluster:SeedNodes:0"] = "akka.tcp://scadabridge@localhost:2551",
["ScadaBridge:Cluster:SeedNodes:1"] = "akka.tcp://scadabridge@localhost:2552",
["ScadaBridge:Database:SkipMigrations"] = "true",
});
});
builder.UseSetting("ScadaBridge:Node:Role", "Central");
builder.UseSetting("ScadaBridge:Database:SkipMigrations", "true");
});
_disposables.Add(factory);
var factory = CreateCentralFactory();
var client = factory.CreateClient();
_disposables.Add(client);
var response = await client.GetAsync("/health/ready");
// The endpoint exists and returns a status code.
// With test infrastructure (no real DB), the database check may fail,
// so we accept either 200 (Healthy) or 503 (Unhealthy).
// The endpoint exists and returns a status code. With test infrastructure
// (no real DB / cluster) the readiness checks may report Unhealthy, so we
// accept either 200 (Healthy/Degraded) or 503 (Unhealthy) — never 404.
Assert.NotEqual(System.Net.HttpStatusCode.NotFound, response.StatusCode);
Assert.True(
response.StatusCode == System.Net.HttpStatusCode.OK ||
response.StatusCode == System.Net.HttpStatusCode.ServiceUnavailable,
@@ -72,39 +89,19 @@ public class HealthCheckTests : IDisposable
}
[Fact]
public async Task HealthActive_Endpoint_ReturnsResponse()
public async Task HealthActive_Endpoint_IsMapped()
{
var previousEnv = Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT");
try
{
Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", "Central");
var factory = new WebApplicationFactory<Program>()
.WithWebHostBuilder(builder =>
{
builder.ConfigureAppConfiguration((context, config) =>
{
config.AddInMemoryCollection(new Dictionary<string, string?>
{
["ScadaBridge:Node:NodeHostname"] = "localhost",
["ScadaBridge:Node:RemotingPort"] = "0",
["ScadaBridge:Cluster:SeedNodes:0"] = "akka.tcp://scadabridge@localhost:2551",
["ScadaBridge:Cluster:SeedNodes:1"] = "akka.tcp://scadabridge@localhost:2552",
["ScadaBridge:Database:SkipMigrations"] = "true",
});
});
builder.UseSetting("ScadaBridge:Node:Role", "Central");
builder.UseSetting("ScadaBridge:Database:SkipMigrations", "true");
});
_disposables.Add(factory);
var factory = CreateCentralFactory();
var client = factory.CreateClient();
_disposables.Add(client);
var response = await client.GetAsync("/health/active");
// In test mode, the ActorSystem may not be fully available,
// so the active-node check returns 503 (Unhealthy).
Assert.NotEqual(System.Net.HttpStatusCode.NotFound, response.StatusCode);
Assert.True(
response.StatusCode == System.Net.HttpStatusCode.OK ||
response.StatusCode == System.Net.HttpStatusCode.ServiceUnavailable,
@@ -117,46 +114,21 @@ public class HealthCheckTests : IDisposable
}
[Fact]
public async Task HealthReady_Endpoint_ExcludesActiveNodeCheck()
public async Task Healthz_LivenessEndpoint_IsMappedAndReturns200()
{
// Host-001 regression: /health/ready must reflect cluster membership + DB
// connectivity only (REQ-HOST-4a), NOT cluster leadership. The leader-only
// "active-node" check belongs solely to /health/active. If /health/ready
// included "active-node", a fully operational standby central node would
// permanently report 503, breaking load-balancer failover readiness.
// New tier added by adopting the shared library: /healthz runs no checks, so it
// returns 200 as long as the process is up — independent of DB / cluster state.
var previousEnv = Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT");
try
{
Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", "Central");
var factory = new WebApplicationFactory<Program>()
.WithWebHostBuilder(builder =>
{
builder.ConfigureAppConfiguration((context, config) =>
{
config.AddInMemoryCollection(new Dictionary<string, string?>
{
["ScadaBridge:Node:NodeHostname"] = "localhost",
["ScadaBridge:Node:RemotingPort"] = "0",
["ScadaBridge:Cluster:SeedNodes:0"] = "akka.tcp://scadabridge@localhost:2551",
["ScadaBridge:Cluster:SeedNodes:1"] = "akka.tcp://scadabridge@localhost:2552",
["ScadaBridge:Database:SkipMigrations"] = "true",
});
});
builder.UseSetting("ScadaBridge:Node:Role", "Central");
builder.UseSetting("ScadaBridge:Database:SkipMigrations", "true");
});
_disposables.Add(factory);
var factory = CreateCentralFactory();
var client = factory.CreateClient();
_disposables.Add(client);
var response = await client.GetAsync("/health/ready");
var body = await response.Content.ReadAsStringAsync();
var response = await client.GetAsync("/healthz");
// The readiness body lists each executed check by name in its entries map.
// The leader-only "active-node" check must not be among them.
Assert.DoesNotContain("active-node", body);
Assert.Equal(System.Net.HttpStatusCode.OK, response.StatusCode);
}
finally
{
@@ -165,43 +137,54 @@ public class HealthCheckTests : IDisposable
}
[Fact]
public async Task ActiveNodeHealthCheck_SystemNotStarted_ReturnsUnhealthy()
public void ReadyTier_Carries_Database_And_AkkaCluster()
{
// AkkaHostedService before StartAsync has ActorSystem == null.
// The integration test (HealthActive_Endpoint_ReturnsResponse) validates the full
// endpoint wiring. This test validates the null-system path via WebApplicationFactory
// where the ActorSystem may not be available.
// Host-001 regression guard: readiness reflects cluster membership + DB connectivity
// only (REQ-HOST-4a), NOT cluster leadership. The split is now carried by the Ready tag
// rather than a check-name predicate: database + akka-cluster are Ready-tagged, and the
// leader-only active-node check is NOT — so a fully operational standby central node
// still reports ready on /health/ready.
var previousEnv = Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT");
try
{
Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", "Central");
var factory = new WebApplicationFactory<Program>()
.WithWebHostBuilder(builder =>
{
builder.ConfigureAppConfiguration((context, config) =>
{
config.AddInMemoryCollection(new Dictionary<string, string?>
{
["ScadaBridge:Node:NodeHostname"] = "localhost",
["ScadaBridge:Node:RemotingPort"] = "0",
["ScadaBridge:Cluster:SeedNodes:0"] = "akka.tcp://scadabridge@localhost:2551",
["ScadaBridge:Database:SkipMigrations"] = "true",
});
});
builder.UseSetting("ScadaBridge:Node:Role", "Central");
builder.UseSetting("ScadaBridge:Database:SkipMigrations", "true");
});
_disposables.Add(factory);
var factory = CreateCentralFactory();
var client = factory.CreateClient();
_disposables.Add(client);
var registrations = Registrations(factory).ToDictionary(r => r.Name);
var response = await client.GetAsync("/health/active");
var body = await response.Content.ReadAsStringAsync();
Assert.True(registrations.ContainsKey("database"), "Expected a 'database' health check.");
Assert.True(registrations.ContainsKey("akka-cluster"), "Expected an 'akka-cluster' health check.");
// Active-node check returns 503 when ActorSystem is not yet available or not leader
Assert.Equal(System.Net.HttpStatusCode.ServiceUnavailable, response.StatusCode);
Assert.Contains("active-node", body);
Assert.Contains(ZbHealthTags.Ready, registrations["database"].Tags);
Assert.Contains(ZbHealthTags.Ready, registrations["akka-cluster"].Tags);
// The leader-only active-node check must NOT be on the readiness tier.
Assert.DoesNotContain(ZbHealthTags.Ready, registrations["active-node"].Tags);
}
finally
{
Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", previousEnv);
}
}
[Fact]
public void ActiveTier_Carries_Only_ActiveNode()
{
// The active-node leader check carries the Active tag (→ /health/active); the readiness
// checks do not, so /health/active reports leadership alone.
var previousEnv = Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT");
try
{
Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", "Central");
var factory = CreateCentralFactory();
var registrations = Registrations(factory).ToDictionary(r => r.Name);
Assert.True(registrations.ContainsKey("active-node"), "Expected an 'active-node' health check.");
Assert.Contains(ZbHealthTags.Active, registrations["active-node"].Tags);
Assert.DoesNotContain(ZbHealthTags.Active, registrations["database"].Tags);
Assert.DoesNotContain(ZbHealthTags.Active, registrations["akka-cluster"].Tags);
}
finally
{