feat(infra): add Traefik load balancer with active node health check for central cluster failover

Add ActiveNodeHealthCheck that returns 200 only on the Akka.NET cluster
leader, enabling Traefik to route traffic to the active central node and
automatically fail over when the leader changes. Also fixes AkkaClusterHealthCheck
to resolve ActorSystem from AkkaHostedService (was always null via DI).
This commit is contained in:
Joseph Doherty
2026-03-21 00:44:37 -04:00
parent 1a540f4f0a
commit 0a85a839a2
13 changed files with 368 additions and 30 deletions

View File

@@ -1,10 +1,11 @@
using Microsoft.AspNetCore.Mvc.Testing;
using Microsoft.Extensions.Configuration;
using ScadaLink.Host.Health;
namespace ScadaLink.Host.Tests;
/// <summary>
/// WP-12: Tests for /health/ready endpoint.
/// WP-12: Tests for /health/ready and /health/active endpoints.
/// </summary>
public class HealthCheckTests : IDisposable
{
@@ -63,4 +64,94 @@ public class HealthCheckTests : IDisposable
Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", previousEnv);
}
}
[Fact]
public async Task HealthActive_Endpoint_ReturnsResponse()
{
var previousEnv = Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT");
try
{
Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", "Central");
var factory = new WebApplicationFactory<Program>()
.WithWebHostBuilder(builder =>
{
builder.ConfigureAppConfiguration((context, config) =>
{
config.AddInMemoryCollection(new Dictionary<string, string?>
{
["ScadaLink:Node:NodeHostname"] = "localhost",
["ScadaLink:Node:RemotingPort"] = "0",
["ScadaLink:Cluster:SeedNodes:0"] = "akka.tcp://scadalink@localhost:2551",
["ScadaLink:Cluster:SeedNodes:1"] = "akka.tcp://scadalink@localhost:2552",
["ScadaLink:Database:SkipMigrations"] = "true",
});
});
builder.UseSetting("ScadaLink:Node:Role", "Central");
builder.UseSetting("ScadaLink:Database:SkipMigrations", "true");
});
_disposables.Add(factory);
var client = factory.CreateClient();
_disposables.Add(client);
var response = await client.GetAsync("/health/active");
// In test mode, the ActorSystem may not be fully available,
// so the active-node check returns 503 (Unhealthy).
Assert.True(
response.StatusCode == System.Net.HttpStatusCode.OK ||
response.StatusCode == System.Net.HttpStatusCode.ServiceUnavailable,
$"Expected 200 or 503, got {(int)response.StatusCode}");
}
finally
{
Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", previousEnv);
}
}
[Fact]
public async Task ActiveNodeHealthCheck_SystemNotStarted_ReturnsUnhealthy()
{
// AkkaHostedService before StartAsync has ActorSystem == null.
// The integration test (HealthActive_Endpoint_ReturnsResponse) validates the full
// endpoint wiring. This test validates the null-system path via WebApplicationFactory
// where the ActorSystem may not be available.
var previousEnv = Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT");
try
{
Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", "Central");
var factory = new WebApplicationFactory<Program>()
.WithWebHostBuilder(builder =>
{
builder.ConfigureAppConfiguration((context, config) =>
{
config.AddInMemoryCollection(new Dictionary<string, string?>
{
["ScadaLink:Node:NodeHostname"] = "localhost",
["ScadaLink:Node:RemotingPort"] = "0",
["ScadaLink:Cluster:SeedNodes:0"] = "akka.tcp://scadalink@localhost:2551",
["ScadaLink:Database:SkipMigrations"] = "true",
});
});
builder.UseSetting("ScadaLink:Node:Role", "Central");
builder.UseSetting("ScadaLink:Database:SkipMigrations", "true");
});
_disposables.Add(factory);
var client = factory.CreateClient();
_disposables.Add(client);
var response = await client.GetAsync("/health/active");
var body = await response.Content.ReadAsStringAsync();
// Active-node check returns 503 when ActorSystem is not yet available or not leader
Assert.Equal(System.Net.HttpStatusCode.ServiceUnavailable, response.StatusCode);
Assert.Contains("active-node", body);
}
finally
{
Environment.SetEnvironmentVariable("DOTNET_ENVIRONMENT", previousEnv);
}
}
}