feat(infra): add Traefik load balancer with active node health check for central cluster failover
Add ActiveNodeHealthCheck, which returns 200 only on the Akka.NET cluster leader, enabling Traefik to route traffic to the active central node and to fail over automatically when the leader changes. Also fix AkkaClusterHealthCheck to resolve the ActorSystem from AkkaHostedService (the ActorSystem previously injected through DI was always null).
This commit is contained in:
40
src/ScadaLink.Host/Health/ActiveNodeHealthCheck.cs
Normal file
40
src/ScadaLink.Host/Health/ActiveNodeHealthCheck.cs
Normal file
@@ -0,0 +1,40 @@
|
||||
using Akka.Cluster;
|
||||
using Microsoft.Extensions.Diagnostics.HealthChecks;
|
||||
using ScadaLink.Host.Actors;
|
||||
|
||||
namespace ScadaLink.Host.Health;
|
||||
|
||||
/// <summary>
/// Health check that is healthy only while this node is the leader of the
/// Akka.NET cluster, i.e. the "active" node. Every other state (actor system
/// missing, member not Up, or another node is leader) is reported as unhealthy.
/// Used by Traefik to route traffic to the active node.
/// </summary>
public class ActiveNodeHealthCheck : IHealthCheck
{
    private readonly AkkaHostedService _akkaService;

    /// <summary>
    /// Creates the check around the hosted service that exposes the actor system.
    /// </summary>
    /// <param name="akkaService">Hosted service whose <c>ActorSystem</c> is inspected on every check.</param>
    public ActiveNodeHealthCheck(AkkaHostedService akkaService)
    {
        _akkaService = akkaService;
    }

    /// <summary>
    /// Evaluates the node's cluster role and returns the outcome as an already-completed task.
    /// </summary>
    /// <param name="context">Health check context (not consulted by this check).</param>
    /// <param name="cancellationToken">Cancellation token (not consulted; the evaluation is synchronous).</param>
    /// <returns>Healthy when this node is the cluster leader; otherwise unhealthy with a reason.</returns>
    public Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        return Task.FromResult(EvaluateNodeRole());
    }

    /// <summary>
    /// Decides whether this node is currently the active (leader) node.
    /// </summary>
    private HealthCheckResult EvaluateNodeRole()
    {
        var actorSystem = _akkaService.ActorSystem;
        if (actorSystem == null)
        {
            // The hosted service has not exposed an actor system yet.
            return HealthCheckResult.Unhealthy("ActorSystem not yet available.");
        }

        var akkaCluster = Cluster.Get(actorSystem);
        var selfMember = akkaCluster.SelfMember;
        if (selfMember.Status != MemberStatus.Up)
        {
            // Any member status other than Up is never treated as active.
            return HealthCheckResult.Unhealthy($"Node not Up (status: {selfMember.Status}).");
        }

        // The leader is only read once the member is known to be Up.
        var leaderAddress = akkaCluster.State.Leader;
        var isLeader = leaderAddress != null && leaderAddress == selfMember.Address;

        return isLeader
            ? HealthCheckResult.Healthy("Active node (cluster leader).")
            : HealthCheckResult.Unhealthy("Standby node (not cluster leader).");
    }
}
|
||||
@@ -1,6 +1,6 @@
|
||||
using Akka.Actor;
|
||||
using Akka.Cluster;
|
||||
using Microsoft.Extensions.Diagnostics.HealthChecks;
|
||||
using ScadaLink.Host.Actors;
|
||||
|
||||
namespace ScadaLink.Host.Health;
|
||||
|
||||
@@ -10,21 +10,22 @@ namespace ScadaLink.Host.Health;
|
||||
/// </summary>
|
||||
public class AkkaClusterHealthCheck : IHealthCheck
|
||||
{
|
||||
private readonly ActorSystem? _system;
|
||||
private readonly AkkaHostedService _akkaService;
|
||||
|
||||
public AkkaClusterHealthCheck(ActorSystem? system = null)
|
||||
public AkkaClusterHealthCheck(AkkaHostedService akkaService)
|
||||
{
|
||||
_system = system;
|
||||
_akkaService = akkaService;
|
||||
}
|
||||
|
||||
public Task<HealthCheckResult> CheckHealthAsync(
|
||||
HealthCheckContext context,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
if (_system == null)
|
||||
var system = _akkaService.ActorSystem;
|
||||
if (system == null)
|
||||
return Task.FromResult(HealthCheckResult.Degraded("ActorSystem not yet available."));
|
||||
|
||||
var cluster = Cluster.Get(_system);
|
||||
var cluster = Cluster.Get(system);
|
||||
var status = cluster.SelfMember.Status;
|
||||
|
||||
var result = status switch
|
||||
|
||||
@@ -87,7 +87,8 @@ try
|
||||
// WP-12: Health checks for readiness gating
|
||||
builder.Services.AddHealthChecks()
|
||||
.AddCheck<DatabaseHealthCheck>("database")
|
||||
.AddCheck<AkkaClusterHealthCheck>("akka-cluster");
|
||||
.AddCheck<AkkaClusterHealthCheck>("akka-cluster")
|
||||
.AddCheck<ActiveNodeHealthCheck>("active-node");
|
||||
|
||||
// WP-13: Akka.NET bootstrap via hosted service
|
||||
builder.Services.AddSingleton<AkkaHostedService>();
|
||||
@@ -126,6 +127,13 @@ try
|
||||
ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
|
||||
});
|
||||
|
||||
// Active node endpoint — returns 200 only on the cluster leader; used by Traefik for routing
|
||||
app.MapHealthChecks("/health/active", new HealthCheckOptions
|
||||
{
|
||||
Predicate = check => check.Name == "active-node",
|
||||
ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
|
||||
});
|
||||
|
||||
app.MapStaticAssets();
|
||||
app.MapCentralUI<ScadaLink.Host.Components.App>();
|
||||
app.MapInboundAPI();
|
||||
|
||||
Reference in New Issue
Block a user