fix(host): register ActorSystem as DI singleton so health-probe scopes don't dispose it (HOST-021)
Per-probe health-check child scopes were disposing the ActorSystem (an IDisposable that was bridged into DI via AddTransient), terminating the live cluster node ~4s after boot. Every singleton-proxy Ask then hung for the full 30s QueryTimeout, so the central report pages (/notifications, /site-calls, /monitoring/health) took ~30s to load. The fix bridges the ActorSystem as a singleton via a new lazy AkkaHostedService.GetOrCreateActorSystem(), so child-scope disposal never touches it. Verified: 0 post-startup terminations, healthy active/standby, report pages load in ~0.05s, Playwright 68 passed / 0 failed.
This commit is contained in:
@@ -34,6 +34,14 @@ public class AkkaHostedService : IHostedService
|
||||
private readonly CommunicationOptions _communicationOptions;
|
||||
private readonly ILogger<AkkaHostedService> _logger;
|
||||
private ActorSystem? _actorSystem;
|
||||
|
||||
/// <summary>
|
||||
/// Guards the one-time creation of <see cref="_actorSystem"/> in
|
||||
/// <see cref="GetOrCreateActorSystem"/> so <see cref="StartAsync"/> and a concurrent
|
||||
/// health-probe resolution of the DI <see cref="ActorSystem"/> singleton can race yet create
|
||||
/// it exactly once (HOST-021).
|
||||
/// </summary>
|
||||
private readonly object _actorSystemLock = new();
|
||||
/// <summary>
|
||||
/// Auxiliary IDisposables (e.g. the SiteAuditTelemetryStalledTracker)
|
||||
/// that this hosted service constructs at start time and must tear down
|
||||
@@ -91,38 +99,18 @@ public class AkkaHostedService : IHostedService
|
||||
/// <returns>A task representing the asynchronous operation.</returns>
|
||||
public async Task StartAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
// For site nodes, include a site-specific role (e.g., "site-SiteA") alongside the base role
|
||||
var roles = BuildRoles();
|
||||
|
||||
// WP-3: Transport heartbeat explicitly configured from CommunicationOptions (not framework defaults)
|
||||
var transportHeartbeatSec = _communicationOptions.TransportHeartbeatInterval.TotalSeconds;
|
||||
var transportFailureSec = _communicationOptions.TransportFailureThreshold.TotalSeconds;
|
||||
|
||||
// Host-006: HOCON is assembled in a dedicated builder that quotes/escapes every
|
||||
// interpolated value, so a hostname, seed node or strategy containing a quote,
|
||||
// backslash or whitespace cannot corrupt the configuration document.
|
||||
var hocon = BuildHocon(_nodeOptions, _clusterOptions, roles,
|
||||
_communicationOptions.TransportHeartbeatInterval,
|
||||
_communicationOptions.TransportFailureThreshold);
|
||||
|
||||
var config = ConfigurationFactory.ParseString(hocon);
|
||||
_actorSystem = ActorSystem.Create("scadabridge", config);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Akka.NET actor system 'scadabridge' started. Role={Role}, Roles={Roles}, Hostname={Hostname}, Port={Port}, " +
|
||||
"TransportHeartbeat={TransportHeartbeat}s, TransportFailure={TransportFailure}s",
|
||||
_nodeOptions.Role,
|
||||
string.Join(", ", roles),
|
||||
_nodeOptions.NodeHostname,
|
||||
_nodeOptions.RemotingPort,
|
||||
transportHeartbeatSec,
|
||||
transportFailureSec);
|
||||
// HOST-021: create (or reuse) the externally-owned, process-singleton ActorSystem. A
|
||||
// health probe may already have created it via the DI singleton bridge
|
||||
// (GetOrCreateActorSystem) before this hosted service's StartAsync ran; either way the
|
||||
// call yields the one instance and sets _actorSystem. Actor registration below then
|
||||
// runs on it.
|
||||
var actorSystem = GetOrCreateActorSystem();
|
||||
|
||||
// Register the dead letter monitor actor
|
||||
var loggerFactory = _serviceProvider.GetRequiredService<ILoggerFactory>();
|
||||
var dlmLogger = loggerFactory.CreateLogger<DeadLetterMonitorActor>();
|
||||
var dlmHealthCollector = _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.HealthMonitoring.ISiteHealthCollector>();
|
||||
_actorSystem.ActorOf(
|
||||
actorSystem.ActorOf(
|
||||
Props.Create(() => new DeadLetterMonitorActor(dlmLogger, dlmHealthCollector)),
|
||||
"dead-letter-monitor");
|
||||
|
||||
@@ -137,6 +125,72 @@ public class AkkaHostedService : IHostedService
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the process-wide Akka <see cref="ActorSystem"/>, creating it on first call.
|
||||
/// Idempotent and thread-safe: both <see cref="StartAsync"/> and the DI bridge that
|
||||
/// exposes the system to the shared <c>ZB.MOM.WW.Health.Akka</c> checks call this, and
|
||||
/// whichever runs first creates the system exactly once.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// HOST-021: the <see cref="ActorSystem"/> is an externally-owned process singleton — its
|
||||
/// lifecycle is this hosted service's (created here, torn down via
|
||||
/// <c>CoordinatedShutdown</c> in <see cref="StopAsync"/>). It MUST be registered in DI as a
|
||||
/// <b>singleton resolved through this method</b>, never as a transient/scoped factory:
|
||||
/// <see cref="ActorSystem"/> is <see cref="IDisposable"/>, and a transient/scoped factory
|
||||
/// hands a fresh disposable to every resolving child scope (e.g. each per-probe
|
||||
/// health-check scope), so the container disposes it when that scope ends —
|
||||
/// <c>ActorSystem.Dispose()</c> runs <c>CoordinatedShutdown(ActorSystemTerminateReason)</c>
|
||||
/// and tears the live cluster node down mid-flight, which is exactly the
|
||||
/// "central report pages hang 30s" defect this method fixes. Creating the system here and
|
||||
/// exposing it as a singleton keeps child-scope disposal away from it; routing the singleton
|
||||
/// through this method (rather than a plain <c>AddSingleton(sp => ...ActorSystem)</c>
|
||||
/// factory) also avoids caching a <c>null</c> if a health probe wins the startup race, since
|
||||
/// the first resolve creates the system instead of capturing a not-yet-started reference.
|
||||
/// </remarks>
|
||||
/// <returns>The single live actor system.</returns>
|
||||
public ActorSystem GetOrCreateActorSystem()
{
    // Fast path: once the system has been published, hand it back without taking the lock.
    var current = _actorSystem;
    if (current is not null)
    {
        return current;
    }

    lock (_actorSystemLock)
    {
        // Re-check under the lock: another caller may have created the system while this
        // one was waiting to enter.
        current = _actorSystem;
        if (current is not null)
        {
            return current;
        }

        // Site nodes carry a site-specific role (e.g., "site-SiteA") in addition to the base role.
        var roles = BuildRoles();
        var heartbeatInterval = _communicationOptions.TransportHeartbeatInterval;
        var failureThreshold = _communicationOptions.TransportFailureThreshold;

        // Host-006: the HOCON document comes from a dedicated builder that quotes/escapes
        // every interpolated value, so a hostname, seed node or strategy containing a quote,
        // backslash or whitespace cannot corrupt the configuration document.
        var hocon = BuildHocon(_nodeOptions, _clusterOptions, roles, heartbeatInterval, failureThreshold);
        var created = ActorSystem.Create("scadabridge", ConfigurationFactory.ParseString(hocon));

        _logger.LogInformation(
            "Akka.NET actor system 'scadabridge' started. Role={Role}, Roles={Roles}, Hostname={Hostname}, Port={Port}, " +
            "TransportHeartbeat={TransportHeartbeat}s, TransportFailure={TransportFailure}s",
            _nodeOptions.Role,
            string.Join(", ", roles),
            _nodeOptions.NodeHostname,
            _nodeOptions.RemotingPort,
            heartbeatInterval.TotalSeconds,
            failureThreshold.TotalSeconds);

        // The field is assigned only after creation and logging have completed, so the
        // unlocked fast-path read above sees either null or the finished system.
        _actorSystem = created;
        return created;
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Builds the Akka HOCON configuration document. Every interpolated value is
|
||||
/// routed through <see cref="QuoteHocon"/> (string values) so a hostname,
|
||||
|
||||
@@ -204,12 +204,17 @@ try
|
||||
builder.Services.AddSingleton<AkkaHostedService>();
|
||||
builder.Services.AddHostedService(sp => sp.GetRequiredService<AkkaHostedService>());
|
||||
|
||||
// The shared ZB.MOM.WW.Health Akka checks resolve ActorSystem from DI. ScadaBridge owns the
|
||||
// ActorSystem inside AkkaHostedService (not a DI singleton), so bridge it as TRANSIENT: each
|
||||
// resolve re-reads the current value — null while warming up (checks → Degraded), live after.
|
||||
// The factory must NOT throw: GetService<ActorSystem>() must return null (not raise) pre-start.
|
||||
builder.Services.AddTransient<Akka.Actor.ActorSystem>(sp =>
|
||||
sp.GetRequiredService<AkkaHostedService>().ActorSystem!);
|
||||
// HOST-021: bridge the AkkaHostedService-owned ActorSystem to DI as a SINGLETON via
|
||||
// GetOrCreateActorSystem(). The shared ZB.MOM.WW.Health Akka checks resolve ActorSystem
|
||||
// from DI, per probe, inside a child scope. ActorSystem is IDisposable, so a TRANSIENT
|
||||
// (or scoped) bridge is captured-and-disposed by each probe's scope — disposing the live
|
||||
// system mid-flight (CoordinatedShutdown/ActorSystemTerminateReason) and wedging the
|
||||
// central report pages at the 30s Ask timeout. A singleton is resolved from the root and
|
||||
// never disposed by a child scope; routing through GetOrCreateActorSystem (instead of a
|
||||
// plain singleton factory over .ActorSystem) means the first resolve CREATES the system
|
||||
// rather than caching a null if a probe wins the startup race.
|
||||
builder.Services.AddSingleton<Akka.Actor.ActorSystem>(sp =>
|
||||
sp.GetRequiredService<AkkaHostedService>().GetOrCreateActorSystem());
|
||||
|
||||
// InboundAPI-022: register the production IActiveNodeGate implementation so
|
||||
// standby-node gating is actually enforced (the InboundApiEndpointFilter
|
||||
|
||||
@@ -75,12 +75,17 @@ public static class SiteServiceRegistration
|
||||
services.AddSingleton<AkkaHostedService>();
|
||||
services.AddHostedService(sp => sp.GetRequiredService<AkkaHostedService>());
|
||||
|
||||
// The shared ZB.MOM.WW.Health Akka checks resolve ActorSystem from DI. ScadaBridge owns the
|
||||
// ActorSystem inside AkkaHostedService (not a DI singleton), so bridge it as TRANSIENT: each
|
||||
// resolve re-reads the current value — null while warming up (checks → Degraded), live after.
|
||||
// The factory must NOT throw: GetService<ActorSystem>() must return null (not raise) pre-start.
|
||||
services.AddTransient<Akka.Actor.ActorSystem>(sp =>
|
||||
sp.GetRequiredService<AkkaHostedService>().ActorSystem!);
|
||||
// HOST-021: bridge the AkkaHostedService-owned ActorSystem to DI as a SINGLETON via
|
||||
// GetOrCreateActorSystem(). The shared ZB.MOM.WW.Health Akka checks resolve ActorSystem
|
||||
// from DI, per probe, inside a child scope. ActorSystem is IDisposable, so a TRANSIENT
|
||||
// (or scoped) bridge is captured-and-disposed by each probe's scope — disposing the live
|
||||
// system mid-flight (CoordinatedShutdown/ActorSystemTerminateReason) and tearing down the
|
||||
// node. A singleton is resolved from the root and never disposed by a child scope; routing
|
||||
// through GetOrCreateActorSystem (instead of a plain singleton factory over .ActorSystem)
|
||||
// means the first resolve CREATES the system rather than caching a null if a probe wins
|
||||
// the startup race.
|
||||
services.AddSingleton<Akka.Actor.ActorSystem>(sp =>
|
||||
sp.GetRequiredService<AkkaHostedService>().GetOrCreateActorSystem());
|
||||
|
||||
// Cluster node status provider for health reports
|
||||
services.AddSingleton<IClusterNodeProvider>(sp =>
|
||||
|
||||
Reference in New Issue
Block a user