Files
ScadaBridge/src/ZB.MOM.WW.ScadaBridge.Host/SiteServiceRegistration.cs
T
Joseph Doherty d33617d65d fix(host): register ActorSystem as DI singleton so health-probe scopes don't dispose it (HOST-021)
Per-probe health-check child scopes were disposing the AddTransient-bridged
ActorSystem (IDisposable), terminating the live cluster node ~4s after boot and
leaving every singleton-proxy Ask to hang the full 30s QueryTimeout — the central
report pages (/notifications, /site-calls, /monitoring/health) loaded in ~30s.
Bridge it as a singleton via a new lazy AkkaHostedService.GetOrCreateActorSystem()
so child-scope disposal never touches it. Verified: 0 post-startup terminates,
healthy active/standby, report pages ~0.05s, Playwright 68 passed / 0 failed.
2026-06-05 08:26:09 -04:00

153 lines
9.2 KiB
C#

using ZB.MOM.WW.ScadaBridge.AuditLog;
using ZB.MOM.WW.ScadaBridge.ClusterInfrastructure;
using ZB.MOM.WW.ScadaBridge.Communication;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using ZB.MOM.WW.ScadaBridge.Commons.Observability;
using ZB.MOM.WW.ScadaBridge.DataConnectionLayer;
using ZB.MOM.WW.ScadaBridge.ExternalSystemGateway;
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
using ZB.MOM.WW.ScadaBridge.Host.Actors;
using ZB.MOM.WW.ScadaBridge.Host.Health;
using ZB.MOM.WW.ScadaBridge.NotificationService;
using ZB.MOM.WW.ScadaBridge.SiteEventLogging;
using ZB.MOM.WW.ScadaBridge.SiteRuntime;
using ZB.MOM.WW.ScadaBridge.StoreAndForward;
using ZB.MOM.WW.Telemetry;
namespace ZB.MOM.WW.ScadaBridge.Host;
/// <summary>
/// Extracted site-role DI registrations so both Program.cs and tests
/// use the same composition root.
/// </summary>
public static class SiteServiceRegistration
{
    /// <summary>
    /// Site database path used when <c>ScadaBridge:Database:SiteDbPath</c> is missing or blank.
    /// </summary>
    private const string DefaultSiteDbPath = "site.db";

    /// <summary>Registers all DI services required for the site role.</summary>
    /// <param name="services">The service collection to register into.</param>
    /// <param name="config">Application configuration for options binding.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="services"/> or <paramref name="config"/> is <see langword="null"/>.
    /// </exception>
    /// <remarks>
    /// The order of the registrations below is significant (see the inline comments on the
    /// Store-and-Forward site context and the audit-log health-metrics bridge); do not reorder
    /// them without re-checking those constraints.
    /// </remarks>
    public static void Configure(IServiceCollection services, IConfiguration config)
    {
        // Fail fast, before anything has been added to the caller's collection, instead of
        // surfacing a NullReferenceException half-way through the registrations.
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(config);

        // Shared components
        services.AddClusterInfrastructure();
        services.AddCommunication();
        services.AddSiteHealthMonitoring();
        services.AddExternalSystemGateway();

        // AddNotificationService() is intentionally NOT registered on the site path.
        // Sites no longer deliver notifications over SMTP — a buffered notification is
        // forwarded to the central cluster (via NotificationForwarder / SiteCommunicationActor),
        // and central owns SMTP delivery through the Notification Outbox. The SMTP machinery
        // (OAuth2TokenService, ISmtpClientWrapper) has no consumer on a site node.

        // Health report transport: sends SiteHealthReport to SiteCommunicationActor via Akka
        services.AddSingleton<ISiteIdentityProvider, SiteIdentityProvider>();
        services.AddSingleton<IHealthReportTransport, AkkaHealthReportTransport>();

        // Site-only components — AddSiteRuntime registers SiteStorageService with SQLite path
        // and site-local repository implementations (IExternalSystemRepository, INotificationRepository).
        // A blank configured path is treated like a missing one: it would otherwise produce the
        // connection string "Data Source=" with no file path at all.
        // NOTE(review): with Microsoft.Data.Sqlite an empty Data Source opens a temporary
        // database — confirm which provider AddSiteRuntime uses.
        var configuredDbPath = config["ScadaBridge:Database:SiteDbPath"];
        var siteDbPath = string.IsNullOrWhiteSpace(configuredDbPath)
            ? DefaultSiteDbPath
            : configuredDbPath;
        services.AddSiteRuntime($"Data Source={siteDbPath}");
        services.AddDataConnectionLayer();

        // Audit Log #23 (M3 Bundle F): adapter that surfaces the site id to
        // StoreAndForwardService through DI WITHOUT introducing a
        // StoreAndForward → HealthMonitoring project-reference cycle. Must be
        // registered BEFORE AddStoreAndForward so the S&F factory resolves a
        // non-empty SiteId at construction time (otherwise the S&F service is
        // a singleton and the empty-string value would be cached for the
        // lifetime of the process).
        services.AddSingleton<ZB.MOM.WW.ScadaBridge.StoreAndForward.IStoreAndForwardSiteContext, StoreAndForwardSiteContext>();
        services.AddStoreAndForward();
        services.AddSiteEventLogging();

        // Audit Log (#23) — site-side hot-path writer + telemetry collaborators.
        // The SiteAuditTelemetryActor itself is registered by AkkaHostedService
        // in the site-role block; this call wires every DI dependency it (and
        // ScriptRuntimeContext, when Bundle F lands) reaches for.
        services.AddAuditLog(config);

        // Audit Log (#23) M2 Bundle G — bridge FallbackAuditWriter primary
        // failures into the site health report payload as
        // SiteAuditWriteFailures. Must come AFTER both AddSiteHealthMonitoring
        // (registers ISiteHealthCollector) and AddAuditLog (registers the
        // NoOp default this call replaces).
        services.AddAuditLogHealthMetricsBridge();

        // WP-13: Akka.NET bootstrap via hosted service. The same singleton instance is exposed
        // both as AkkaHostedService and as the IHostedService the host starts.
        services.AddSingleton<AkkaHostedService>();
        services.AddHostedService(sp => sp.GetRequiredService<AkkaHostedService>());

        // HOST-021: bridge the AkkaHostedService-owned ActorSystem to DI as a SINGLETON via
        // GetOrCreateActorSystem(). The shared ZB.MOM.WW.Health Akka checks resolve ActorSystem
        // from DI, per probe, inside a child scope. ActorSystem is IDisposable, so a TRANSIENT
        // (or scoped) bridge is captured-and-disposed by each probe's scope — disposing the live
        // system mid-flight (CoordinatedShutdown/ActorSystemTerminateReason) and tearing down the
        // node. A singleton is resolved from the root and never disposed by a child scope; routing
        // through GetOrCreateActorSystem (instead of a plain singleton factory over .ActorSystem)
        // means the first resolve CREATES the system rather than caching a null if a probe wins
        // the startup race.
        services.AddSingleton<Akka.Actor.ActorSystem>(sp =>
            sp.GetRequiredService<AkkaHostedService>().GetOrCreateActorSystem());

        // Cluster node status provider for health reports. The role name is derived from the
        // configured site id as "site-{SiteId}".
        services.AddSingleton<IClusterNodeProvider>(sp =>
        {
            var akkaService = sp.GetRequiredService<AkkaHostedService>();
            var nodeOptions = sp.GetRequiredService<Microsoft.Extensions.Options.IOptions<NodeOptions>>().Value;
            var siteRole = $"site-{nodeOptions.SiteId}";
            return new AkkaClusterNodeProvider(akkaService, siteRole);
        });

        // Options binding: shared sections first, then the site-only sections.
        BindSharedOptions(services, config);
        services.Configure<SiteRuntimeOptions>(config.GetSection("ScadaBridge:SiteRuntime"));
        services.Configure<DataConnectionOptions>(config.GetSection("ScadaBridge:DataConnection"));
        services.Configure<StoreAndForwardOptions>(config.GetSection("ScadaBridge:StoreAndForward"));
        services.Configure<SiteEventLogOptions>(config.GetSection("ScadaBridge:SiteEventLog"));
    }

    /// <summary>Binds shared options sections (Node, Cluster, Database, Communication, etc.) used by both site and central roles.</summary>
    /// <param name="services">The service collection to bind options into.</param>
    /// <param name="config">Application configuration supplying the option values.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="services"/> or <paramref name="config"/> is <see langword="null"/>.
    /// </exception>
    public static void BindSharedOptions(IServiceCollection services, IConfiguration config)
    {
        // Validate up front so a null argument is reported by name rather than as a
        // NullReferenceException from the first GetSection call.
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(config);

        services.Configure<NodeOptions>(config.GetSection("ScadaBridge:Node"));

        // Bind + eagerly validate: ClusterOptionsValidator is registered (TryAddEnumerable)
        // by the ClusterInfrastructure module, so chaining ValidateOnStart() here makes a bad
        // ScadaBridge:Cluster section fail fast at host build instead of lazily on first resolve.
        services.AddOptions<ClusterOptions>().Bind(config.GetSection("ScadaBridge:Cluster")).ValidateOnStart();
        services.Configure<DatabaseOptions>(config.GetSection("ScadaBridge:Database"));
        services.Configure<CommunicationOptions>(config.GetSection("ScadaBridge:Communication"));

        // Bind + eagerly validate: HealthMonitoringOptionsValidator is registered (TryAddEnumerable)
        // by the HealthMonitoring module, so chaining ValidateOnStart() here makes a bad
        // ScadaBridge:HealthMonitoring section fail fast at host build instead of lazily on first resolve.
        services.AddOptions<HealthMonitoringOptions>().Bind(config.GetSection("ScadaBridge:HealthMonitoring")).ValidateOnStart();
        services.Configure<NotificationOptions>(config.GetSection("ScadaBridge:Notification"));
        services.Configure<LoggingOptions>(config.GetSection("ScadaBridge:Logging"));

        // Audit Log (#23) — exposes ScadaBridge:Node:NodeName to downstream audit
        // writers so they can stamp the SourceNode column. Registered here in
        // shared bootstrap because every node (central + site) needs it.
        services.AddSingleton<INodeIdentityProvider, NodeIdentityProvider>();

        // Observability — shared ZB.MOM.WW.Telemetry. Registered in shared bootstrap so
        // BOTH the central and site composition roots wire the OTel Resource (the
        // service.name/site.id/node.role identity triple) + standard instrumentation +
        // the always-on Prometheus exporter. Mount the /metrics scrape endpoint per role
        // with app.MapZbMetrics(). The same `?? "central"` SiteId default Program.cs uses
        // is applied here so the Resource attribute matches the log-enricher value.
        // The application meter is named so OTel observes its instruments; emit points are
        // wired by follow-on tasks (the instruments are no-op until a listener attaches).
        services.AddZbTelemetry(o =>
        {
            o.ServiceName = "scadabridge";
            o.SiteId = config["ScadaBridge:Node:SiteId"] ?? "central";
            o.NodeRole = config["ScadaBridge:Node:Role"];
            o.Meters = [ScadaBridgeTelemetry.MeterName];

            // Exporter and OTLP endpoint are optional overrides: when the setting is absent
            // (or, for the exporter, not parseable as ZbExporter) the option keeps its default.
            if (Enum.TryParse<ZbExporter>(config["ScadaBridge:Telemetry:Exporter"], ignoreCase: true, out var exporter))
            {
                o.Exporter = exporter;
            }

            var otlp = config["ScadaBridge:Telemetry:OtlpEndpoint"];
            if (!string.IsNullOrWhiteSpace(otlp))
            {
                o.OtlpEndpoint = otlp;
            }
        });
    }
}