feat(infra): add Traefik load balancer with active node health check for central cluster failover

Add ActiveNodeHealthCheck that returns 200 only on the Akka.NET cluster
leader, enabling Traefik to route traffic to the active central node and
automatically fail over when the leader changes. Also fixes AkkaClusterHealthCheck
to resolve ActorSystem from AkkaHostedService (was always null via DI).
This commit is contained in:
Joseph Doherty
2026-03-21 00:44:37 -04:00
parent 1a540f4f0a
commit 0a85a839a2
13 changed files with 368 additions and 30 deletions

View File

@@ -87,7 +87,8 @@ try
// WP-12: Health checks for readiness gating
builder.Services.AddHealthChecks()
.AddCheck<DatabaseHealthCheck>("database")
.AddCheck<AkkaClusterHealthCheck>("akka-cluster");
.AddCheck<AkkaClusterHealthCheck>("akka-cluster")
.AddCheck<ActiveNodeHealthCheck>("active-node");
// WP-13: Akka.NET bootstrap via hosted service
builder.Services.AddSingleton<AkkaHostedService>();
@@ -126,6 +127,13 @@ try
ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
});
// Active node endpoint — returns 200 only on the cluster leader; used by Traefik for routing
app.MapHealthChecks("/health/active", new HealthCheckOptions
{
Predicate = check => check.Name == "active-node",
ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
});
app.MapStaticAssets();
app.MapCentralUI<ScadaLink.Host.Components.App>();
app.MapInboundAPI();