fix(host): register ActorSystem as DI singleton so health-probe scopes don't dispose it (HOST-021)

Per-probe health-check child scopes were disposing the AddTransient-bridged
ActorSystem (IDisposable), terminating the live cluster node ~4s after boot and
leaving every singleton-proxy Ask to hang the full 30s QueryTimeout — the central
report pages (/notifications, /site-calls, /monitoring/health) loaded in ~30s.
Bridge it as a singleton via a new lazy AkkaHostedService.GetOrCreateActorSystem()
so child-scope disposal never touches it. Verified: 0 post-startup terminates,
healthy active/standby, report pages ~0.05s, Playwright 68 passed / 0 failed.
This commit is contained in:
Joseph Doherty
2026-06-05 08:26:09 -04:00
parent 0783547a2d
commit d33617d65d
4 changed files with 328 additions and 39 deletions
@@ -34,6 +34,14 @@ public class AkkaHostedService : IHostedService
private readonly CommunicationOptions _communicationOptions;
private readonly ILogger<AkkaHostedService> _logger;
private ActorSystem? _actorSystem;
/// <summary>
/// Guards the one-time creation of <see cref="_actorSystem"/> in
/// <see cref="GetOrCreateActorSystem"/>: when <see cref="StartAsync"/> and a concurrent
/// health-probe resolution of the DI <see cref="ActorSystem"/> singleton race, the system
/// is still created exactly once (HOST-021).
/// </summary>
private readonly object _actorSystemLock = new();
/// <summary>
/// Auxiliary IDisposables (e.g. the SiteAuditTelemetryStalledTracker)
/// that this hosted service constructs at start time and must tear down
@@ -91,38 +99,18 @@ public class AkkaHostedService : IHostedService
/// <returns>A task representing the asynchronous operation.</returns>
public async Task StartAsync(CancellationToken cancellationToken)
{
// For site nodes, include a site-specific role (e.g., "site-SiteA") alongside the base role
var roles = BuildRoles();
// WP-3: Transport heartbeat explicitly configured from CommunicationOptions (not framework defaults)
var transportHeartbeatSec = _communicationOptions.TransportHeartbeatInterval.TotalSeconds;
var transportFailureSec = _communicationOptions.TransportFailureThreshold.TotalSeconds;
// Host-006: HOCON is assembled in a dedicated builder that quotes/escapes every
// interpolated value, so a hostname, seed node or strategy containing a quote,
// backslash or whitespace cannot corrupt the configuration document.
var hocon = BuildHocon(_nodeOptions, _clusterOptions, roles,
_communicationOptions.TransportHeartbeatInterval,
_communicationOptions.TransportFailureThreshold);
var config = ConfigurationFactory.ParseString(hocon);
_actorSystem = ActorSystem.Create("scadabridge", config);
_logger.LogInformation(
"Akka.NET actor system 'scadabridge' started. Role={Role}, Roles={Roles}, Hostname={Hostname}, Port={Port}, " +
"TransportHeartbeat={TransportHeartbeat}s, TransportFailure={TransportFailure}s",
_nodeOptions.Role,
string.Join(", ", roles),
_nodeOptions.NodeHostname,
_nodeOptions.RemotingPort,
transportHeartbeatSec,
transportFailureSec);
// HOST-021: create (or reuse) the externally-owned, process-singleton ActorSystem. A
// health probe may already have created it via the DI singleton bridge
// (GetOrCreateActorSystem) before this hosted service's StartAsync ran; either way the
// call yields the one instance and sets _actorSystem. Actor registration below then
// runs on it.
var actorSystem = GetOrCreateActorSystem();
// Register the dead letter monitor actor
var loggerFactory = _serviceProvider.GetRequiredService<ILoggerFactory>();
var dlmLogger = loggerFactory.CreateLogger<DeadLetterMonitorActor>();
var dlmHealthCollector = _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.HealthMonitoring.ISiteHealthCollector>();
_actorSystem.ActorOf(
actorSystem.ActorOf(
Props.Create(() => new DeadLetterMonitorActor(dlmLogger, dlmHealthCollector)),
"dead-letter-monitor");
@@ -137,6 +125,72 @@ public class AkkaHostedService : IHostedService
}
}
/// <summary>
/// Gets the single process-wide Akka <see cref="ActorSystem"/>, building it lazily the
/// first time it is requested. Safe to call repeatedly and from several threads:
/// <see cref="StartAsync"/> and the DI bridge used by the shared
/// <c>ZB.MOM.WW.Health.Akka</c> checks both go through this method, and only the first
/// caller actually creates the system.
/// </summary>
/// <remarks>
/// HOST-021: the <see cref="ActorSystem"/> is owned by this hosted service, not by the
/// container — it is created here and torn down via <c>CoordinatedShutdown</c> in
/// <see cref="StopAsync"/>. Its DI registration MUST therefore be a <b>singleton that
/// resolves through this method</b>, never a transient or scoped factory.
/// <see cref="ActorSystem"/> is <see cref="IDisposable"/>; a transient/scoped factory hands
/// the disposable to every resolving child scope (for example each per-probe health-check
/// scope), and the container disposes it when that scope ends.
/// <c>ActorSystem.Dispose()</c> runs <c>CoordinatedShutdown(ActorSystemTerminateReason)</c>,
/// which takes the live cluster node down mid-flight — the cause of the
/// "central report pages hang 30s" defect. A singleton keeps child-scope disposal away from
/// the system. Resolving it through this method, rather than through a plain
/// <c>AddSingleton(sp =&gt; ...ActorSystem)</c> factory, additionally prevents a
/// <c>null</c> from being cached when a health probe wins the startup race: the first
/// resolve creates the system instead of capturing a reference that has not been set yet.
/// </remarks>
/// <returns>The single live actor system.</returns>
public ActorSystem GetOrCreateActorSystem()
{
    // Fast path: the system has already been published, so no lock is needed.
    if (_actorSystem is { } existing)
    {
        return existing;
    }

    lock (_actorSystemLock)
    {
        // Re-check under the lock: another caller may have created the system while this
        // one was waiting to enter.
        if (_actorSystem is null)
        {
            var heartbeatInterval = _communicationOptions.TransportHeartbeatInterval;
            var failureThreshold = _communicationOptions.TransportFailureThreshold;

            // Site nodes carry a site-specific role (e.g. "site-SiteA") next to the base role.
            var nodeRoles = BuildRoles();

            // Host-006: the HOCON document comes from a dedicated builder that quotes/escapes
            // every interpolated value, so a hostname, seed node or strategy containing a
            // quote, backslash or whitespace cannot corrupt the configuration.
            var akkaConfig = ConfigurationFactory.ParseString(
                BuildHocon(_nodeOptions, _clusterOptions, nodeRoles, heartbeatInterval, failureThreshold));

            var created = ActorSystem.Create("scadabridge", akkaConfig);

            _logger.LogInformation(
                "Akka.NET actor system 'scadabridge' started. Role={Role}, Roles={Roles}, Hostname={Hostname}, Port={Port}, " +
                "TransportHeartbeat={TransportHeartbeat}s, TransportFailure={TransportFailure}s",
                _nodeOptions.Role,
                string.Join(", ", nodeRoles),
                _nodeOptions.NodeHostname,
                _nodeOptions.RemotingPort,
                heartbeatInterval.TotalSeconds,
                failureThreshold.TotalSeconds);

            // Publish last: the field is assigned only after creation has completed, so the
            // lock-free check above is not satisfied before the system exists.
            _actorSystem = created;
        }

        return _actorSystem;
    }
}
/// <summary>
/// Builds the Akka HOCON configuration document. Every interpolated value is
/// routed through <see cref="QuoteHocon"/> (string values) so a hostname,
+11 -6
View File
@@ -204,12 +204,17 @@ try
builder.Services.AddSingleton<AkkaHostedService>();
builder.Services.AddHostedService(sp => sp.GetRequiredService<AkkaHostedService>());
// The shared ZB.MOM.WW.Health Akka checks resolve ActorSystem from DI. ScadaBridge owns the
// ActorSystem inside AkkaHostedService (not a DI singleton), so bridge it as TRANSIENT: each
// resolve re-reads the current value — null while warming up (checks → Degraded), live after.
// The factory must NOT throw: GetService<ActorSystem>() must return null (not raise) pre-start.
builder.Services.AddTransient<Akka.Actor.ActorSystem>(sp =>
sp.GetRequiredService<AkkaHostedService>().ActorSystem!);
// HOST-021: bridge the AkkaHostedService-owned ActorSystem to DI as a SINGLETON via
// GetOrCreateActorSystem(). The shared ZB.MOM.WW.Health Akka checks resolve ActorSystem
// from DI, per probe, inside a child scope. ActorSystem is IDisposable, so a TRANSIENT
// (or scoped) bridge is captured-and-disposed by each probe's scope — disposing the live
// system mid-flight (CoordinatedShutdown/ActorSystemTerminateReason) and wedging the
// central report pages at the 30s Ask timeout. A singleton is resolved from the root and
// never disposed by a child scope; routing through GetOrCreateActorSystem (instead of a
// plain singleton factory over .ActorSystem) means the first resolve CREATES the system
// rather than caching a null if a probe wins the startup race.
builder.Services.AddSingleton<Akka.Actor.ActorSystem>(sp =>
sp.GetRequiredService<AkkaHostedService>().GetOrCreateActorSystem());
// InboundAPI-022: register the production IActiveNodeGate implementation so
// standby-node gating is actually enforced (the InboundApiEndpointFilter
@@ -75,12 +75,17 @@ public static class SiteServiceRegistration
services.AddSingleton<AkkaHostedService>();
services.AddHostedService(sp => sp.GetRequiredService<AkkaHostedService>());
// The shared ZB.MOM.WW.Health Akka checks resolve ActorSystem from DI. ScadaBridge owns the
// ActorSystem inside AkkaHostedService (not a DI singleton), so bridge it as TRANSIENT: each
// resolve re-reads the current value — null while warming up (checks → Degraded), live after.
// The factory must NOT throw: GetService<ActorSystem>() must return null (not raise) pre-start.
services.AddTransient<Akka.Actor.ActorSystem>(sp =>
sp.GetRequiredService<AkkaHostedService>().ActorSystem!);
// HOST-021: bridge the AkkaHostedService-owned ActorSystem to DI as a SINGLETON via
// GetOrCreateActorSystem(). The shared ZB.MOM.WW.Health Akka checks resolve ActorSystem
// from DI, per probe, inside a child scope. ActorSystem is IDisposable, so a TRANSIENT
// (or scoped) bridge is captured-and-disposed by each probe's scope — disposing the live
// system mid-flight (CoordinatedShutdown/ActorSystemTerminateReason) and tearing down the
// node. A singleton is resolved from the root and never disposed by a child scope; routing
// through GetOrCreateActorSystem (instead of a plain singleton factory over .ActorSystem)
// means the first resolve CREATES the system rather than caching a null if a probe wins
// the startup race.
services.AddSingleton<Akka.Actor.ActorSystem>(sp =>
sp.GetRequiredService<AkkaHostedService>().GetOrCreateActorSystem());
// Cluster node status provider for health reports
services.AddSingleton<IClusterNodeProvider>(sp =>