1149 lines
62 KiB
C#
1149 lines
62 KiB
C#
using System.Collections.Immutable;
|
|
using Akka.Actor;
|
|
using Akka.Cluster;
|
|
using Akka.Cluster.Tools.Client;
|
|
using Akka.Cluster.Tools.Singleton;
|
|
using Akka.Configuration;
|
|
using Microsoft.Extensions.Options;
|
|
using ZB.MOM.WW.ScadaBridge.ClusterInfrastructure;
|
|
using ZB.MOM.WW.ScadaBridge.Communication;
|
|
using ZB.MOM.WW.ScadaBridge.Communication.Actors;
|
|
using ZB.MOM.WW.ScadaBridge.Host.Actors;
|
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime;
|
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors;
|
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
|
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
|
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts;
|
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Streaming;
|
|
using ZB.MOM.WW.ScadaBridge.StoreAndForward;
|
|
|
|
namespace ZB.MOM.WW.ScadaBridge.Host.Actors;
|
|
|
|
/// <summary>
|
|
/// Hosted service that manages the Akka.NET actor system lifecycle.
|
|
/// Creates the actor system on start, registers actors, and triggers
|
|
/// CoordinatedShutdown on stop.
|
|
///
|
|
/// WP-3: Transport heartbeat is explicitly configured in HOCON from CommunicationOptions.
|
|
/// </summary>
|
|
public class AkkaHostedService : IHostedService
|
|
{
|
|
/// <summary>Service provider from which this hosted service resolves its collaborators.</summary>
private readonly IServiceProvider _serviceProvider;

/// <summary>Bound node options (role, hostname, remoting port, site id).</summary>
private readonly NodeOptions _nodeOptions;

/// <summary>Bound cluster options, handed to <see cref="BuildHocon"/> when the configuration is assembled.</summary>
private readonly ClusterOptions _clusterOptions;

/// <summary>Bound communication options; source of the transport heartbeat interval and failure threshold.</summary>
private readonly CommunicationOptions _communicationOptions;

/// <summary>Logger used for lifecycle diagnostics.</summary>
private readonly ILogger<AkkaHostedService> _logger;

/// <summary>The actor system, assigned by <see cref="GetOrCreateActorSystem"/>; <c>null</c> until then.</summary>
private ActorSystem? _actorSystem;

/// <summary>
/// Lock taken by <see cref="GetOrCreateActorSystem"/> around the one-time creation of
/// <see cref="_actorSystem"/>. <see cref="StartAsync"/> and a health probe resolving the DI
/// <see cref="ActorSystem"/> singleton may arrive concurrently; the lock ensures the system
/// is created exactly once whichever caller gets there first (HOST-021).
/// </summary>
private readonly object _actorSystemLock = new object();

/// <summary>
/// Auxiliary disposables (for example the SiteAuditTelemetryStalledTracker) constructed by
/// this hosted service at start time and torn down on shutdown. They are not part of the
/// ActorSystem lifecycle but live in the same process scope.
/// </summary>
private readonly List<IDisposable> _trackedDisposables = new List<IDisposable>();

/// <summary>
/// NotificationService-020 guard. Becomes <c>true</c> the first time a Notification-category
/// S&amp;F delivery handler is registered on this instance.
/// <see cref="StoreAndForwardService.RegisterDeliveryHandler"/> is last-write-wins per
/// category, so a second registration path (e.g. a role branch and a helper that both
/// register) would silently replace the canonical <c>NotificationForwarder</c> handler with
/// whichever registration ran last. That is what happened to the earlier NS-001 fix, which
/// was silently superseded when the central-only redesign moved delivery to
/// <c>NotificationOutbox</c>. The flag exists so a duplicate registration is loud at startup
/// and a maintainer who re-introduces the second path notices immediately.
/// </summary>
private bool _notificationDeliveryHandlerRegistered;
|
|
|
|
/// <summary>
/// Creates the hosted service and captures the bound option values it needs later.
/// </summary>
/// <param name="serviceProvider">Provider used to resolve dependencies when actors are registered.</param>
/// <param name="nodeOptions">Options wrapper for this node's settings.</param>
/// <param name="clusterOptions">Options wrapper for the cluster settings.</param>
/// <param name="communicationOptions">Options wrapper for the communication settings.</param>
/// <param name="logger">Logger for this service.</param>
public AkkaHostedService(
    IServiceProvider serviceProvider,
    IOptions<NodeOptions> nodeOptions,
    IOptions<ClusterOptions> clusterOptions,
    IOptions<CommunicationOptions> communicationOptions,
    ILogger<AkkaHostedService> logger)
{
    _logger = logger;
    _serviceProvider = serviceProvider;

    // Unwrap each IOptions<T> once; the values are read in the same order as the parameters.
    _nodeOptions = nodeOptions.Value;
    _clusterOptions = clusterOptions.Value;
    _communicationOptions = communicationOptions.Value;
}
|
|
|
|
/// <summary>
/// Gets the actor system held by this hosted service, or <c>null</c> when it has not been
/// created yet.
/// </summary>
/// <remarks>
/// This only reads the backing field and never creates the system; call
/// <see cref="GetOrCreateActorSystem"/> (as <see cref="StartAsync"/> does) to obtain a
/// non-null instance.
/// </remarks>
public ActorSystem? ActorSystem
{
    get { return _actorSystem; }
}
|
|
|
|
/// <summary>
/// Obtains the actor system, registers the dead-letter monitor, and then registers the
/// actors that belong to this node's role.
/// </summary>
/// <param name="cancellationToken">Token forwarded to the site actor registration.</param>
/// <returns>A task that completes once role-specific registration has finished.</returns>
public async Task StartAsync(CancellationToken cancellationToken)
{
    // HOST-021: the ActorSystem is an externally-owned process singleton. A health probe may
    // already have created it through the DI bridge (GetOrCreateActorSystem) before this
    // method ran; either way this call returns the single instance and sets _actorSystem,
    // and all registration below runs against it.
    var system = GetOrCreateActorSystem();

    // Dead letter monitor: runs on every node regardless of role.
    var monitorLogger = _serviceProvider
        .GetRequiredService<ILoggerFactory>()
        .CreateLogger<DeadLetterMonitorActor>();
    var monitorHealthCollector =
        _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.HealthMonitoring.ISiteHealthCollector>();
    system.ActorOf(
        Props.Create(() => new DeadLetterMonitorActor(monitorLogger, monitorHealthCollector)),
        "dead-letter-monitor");

    // Role-specific actors. Any role other than Central or Site registers nothing further.
    var role = _nodeOptions.Role;
    if (role.Equals("Central", StringComparison.OrdinalIgnoreCase))
    {
        RegisterCentralActors();
        return;
    }

    if (role.Equals("Site", StringComparison.OrdinalIgnoreCase))
    {
        await RegisterSiteActorsAsync(cancellationToken);
    }
}
|
|
|
|
/// <summary>
/// Returns the process-wide Akka <see cref="ActorSystem"/>, creating it on the first call.
/// Safe to call repeatedly and from several threads: <see cref="StartAsync"/> and the DI
/// bridge that exposes the system to the shared <c>ZB.MOM.WW.Health.Akka</c> checks both
/// call it, and whichever arrives first performs the single creation.
/// </summary>
/// <remarks>
/// HOST-021: the <see cref="ActorSystem"/> is an externally-owned process singleton whose
/// lifecycle belongs to this hosted service (created here, torn down via
/// <c>CoordinatedShutdown</c> in <see cref="StopAsync"/>). It MUST be registered in DI as a
/// <b>singleton resolved through this method</b>, never as a transient/scoped factory.
/// <see cref="ActorSystem"/> is <see cref="IDisposable"/>; a transient/scoped factory hands a
/// fresh disposable to every resolving child scope (e.g. each per-probe health-check scope),
/// and the container disposes it when that scope ends. <c>ActorSystem.Dispose()</c> runs
/// <c>CoordinatedShutdown(ActorSystemTerminateReason)</c> and tears the live cluster node
/// down mid-flight, which is the "central report pages hang 30s" defect this method fixes.
/// Creating the system here and exposing it as a singleton keeps child-scope disposal away
/// from it. Routing the singleton through this method, rather than through a plain
/// <c>AddSingleton(sp => ...ActorSystem)</c> factory, also avoids caching a <c>null</c> when
/// a health probe wins the startup race: the first resolve creates the system instead of
/// capturing a not-yet-started reference.
/// </remarks>
/// <returns>The single live actor system.</returns>
public ActorSystem GetOrCreateActorSystem()
{
    // Already published: return it without taking the lock.
    if (_actorSystem is { } published)
    {
        return published;
    }

    lock (_actorSystemLock)
    {
        // Check again now that the lock is held; a concurrent caller may have created it.
        if (_actorSystem is null)
        {
            // Site nodes carry a site-specific role (e.g. "site-SiteA") next to the base role.
            var nodeRoles = BuildRoles();
            var heartbeat = _communicationOptions.TransportHeartbeatInterval;
            var failureThreshold = _communicationOptions.TransportFailureThreshold;

            // Host-006: the HOCON document comes from a dedicated builder that quotes and
            // escapes every interpolated value, so a hostname, seed node or strategy holding
            // a quote, backslash or whitespace cannot corrupt the configuration.
            var document = BuildHocon(_nodeOptions, _clusterOptions, nodeRoles, heartbeat, failureThreshold);
            var created = ActorSystem.Create("scadabridge", ConfigurationFactory.ParseString(document));

            _logger.LogInformation(
                "Akka.NET actor system 'scadabridge' started. Role={Role}, Roles={Roles}, Hostname={Hostname}, Port={Port}, " +
                "TransportHeartbeat={TransportHeartbeat}s, TransportFailure={TransportFailure}s",
                _nodeOptions.Role,
                string.Join(", ", nodeRoles),
                _nodeOptions.NodeHostname,
                _nodeOptions.RemotingPort,
                heartbeat.TotalSeconds,
                failureThreshold.TotalSeconds);

            // Assign the field last so a concurrent reader never sees a half-built system.
            _actorSystem = created;
        }

        return _actorSystem;
    }
}
|
|
|
|
/// <summary>
/// Builds the Akka HOCON configuration document. Every interpolated string value goes
/// through <see cref="QuoteHocon"/>, so a hostname, seed-node URI, role or split-brain
/// strategy containing a quote, backslash or whitespace cannot corrupt the document or be
/// silently misparsed (Host-006).
///
/// Host-012: the <c>keep-oldest down-if-alone</c> flag is emitted from
/// <see cref="ClusterOptions.DownIfAlone"/> rather than hard-coded, so the bound
/// configuration value is actually consumed.
///
/// Host-013: every duration is rendered via <see cref="DurationHocon"/> in milliseconds, so
/// sub-second cluster timing values (e.g. a 750ms heartbeat) are preserved exactly instead
/// of being rounded to whole seconds.
///
/// Numeric values are formatted with the invariant culture so the generated document does
/// not depend on the culture of the thread that builds it.
/// </summary>
/// <param name="nodeOptions">The node configuration options.</param>
/// <param name="clusterOptions">The cluster configuration options.</param>
/// <param name="roles">The node roles to configure; enumerated once.</param>
/// <param name="transportHeartbeat">The transport heartbeat interval.</param>
/// <param name="transportFailure">The transport failure threshold.</param>
/// <returns>The Akka HOCON configuration string.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="nodeOptions"/>, <paramref name="clusterOptions"/> or
/// <paramref name="roles"/> is <c>null</c>.
/// </exception>
public static string BuildHocon(
    NodeOptions nodeOptions,
    ClusterOptions clusterOptions,
    IEnumerable<string> roles,
    TimeSpan transportHeartbeat,
    TimeSpan transportFailure)
{
    // Public entry point: fail with a named argument instead of a NullReferenceException
    // somewhere inside the interpolation below.
    ArgumentNullException.ThrowIfNull(nodeOptions);
    ArgumentNullException.ThrowIfNull(clusterOptions);
    ArgumentNullException.ThrowIfNull(roles);

    var seedNodesStr = string.Join(",",
        clusterOptions.SeedNodes.Select(QuoteHocon));
    var rolesStr = string.Join(",", roles.Select(QuoteHocon));

    // FormattableString.Invariant formats every hole (port, member count, ...) with the
    // invariant culture; string holes are unaffected.
    return FormattableString.Invariant($@"
audit-telemetry-dispatcher {{
  type = ForkJoinDispatcher
  throughput = 100
  dedicated-thread-pool {{
    thread-count = 2
  }}
}}
akka {{
  extensions = [
    ""Akka.Cluster.Tools.PublishSubscribe.DistributedPubSubExtensionProvider, Akka.Cluster.Tools""
  ]
  actor {{
    provider = cluster
  }}
  remote {{
    dot-netty.tcp {{
      hostname = {QuoteHocon(nodeOptions.NodeHostname)}
      port = {nodeOptions.RemotingPort}
    }}
    transport-failure-detector {{
      heartbeat-interval = {DurationHocon(transportHeartbeat)}
      acceptable-heartbeat-pause = {DurationHocon(transportFailure)}
    }}
  }}
  cluster {{
    seed-nodes = [{seedNodesStr}]
    roles = [{rolesStr}]
    min-nr-of-members = {clusterOptions.MinNrOfMembers}
    split-brain-resolver {{
      active-strategy = {QuoteHocon(clusterOptions.SplitBrainResolverStrategy)}
      stable-after = {DurationHocon(clusterOptions.StableAfter)}
      keep-oldest {{
        down-if-alone = {(clusterOptions.DownIfAlone ? "on" : "off")}
      }}
    }}
    failure-detector {{
      heartbeat-interval = {DurationHocon(clusterOptions.HeartbeatInterval)}
      acceptable-heartbeat-pause = {DurationHocon(clusterOptions.FailureDetectionThreshold)}
    }}
    run-coordinated-shutdown-when-down = on
  }}
  coordinated-shutdown {{
    run-by-clr-shutdown-hook = on
  }}
}}");
}
|
|
|
|
/// <summary>
/// Renders a <see cref="TimeSpan"/> as a HOCON duration in whole milliseconds with an
/// <c>ms</c> suffix. Emitting milliseconds preserves sub-second configuration exactly: a
/// 750ms heartbeat stays <c>750ms</c> rather than being rounded to <c>1s</c> (or, for
/// sub-half-second values, collapsing to <c>0s</c>) — Host-013.
/// </summary>
/// <param name="duration">The duration to render.</param>
/// <returns>The rounded millisecond count followed by <c>ms</c>, e.g. <c>750ms</c>.</returns>
private static string DurationHocon(TimeSpan duration)
{
    var wholeMilliseconds = (long)Math.Round(duration.TotalMilliseconds);

    // The token is parsed by a machine, so format it with the invariant culture; the current
    // culture may define a different negative sign.
    return wholeMilliseconds.ToString(System.Globalization.CultureInfo.InvariantCulture) + "ms";
}
|
|
|
|
/// <summary>
/// Renders a value as a HOCON double-quoted string. Backslashes, double quotes and control
/// characters are escaped so the token cannot break out of its string literal and contains
/// no raw control characters.
/// </summary>
/// <param name="value">The text to quote; <c>null</c> is treated as the empty string.</param>
/// <returns>The escaped text wrapped in double quotes.</returns>
private static string QuoteHocon(string? value)
{
    var text = value ?? string.Empty;
    var quoted = new System.Text.StringBuilder(text.Length + 2);
    quoted.Append('"');

    foreach (var ch in text)
    {
        switch (ch)
        {
            case '\\':
                quoted.Append("\\\\");
                break;
            case '"':
                quoted.Append("\\\"");
                break;
            case '\n':
                quoted.Append("\\n");
                break;
            case '\r':
                quoted.Append("\\r");
                break;
            case '\t':
                quoted.Append("\\t");
                break;
            case '\b':
                quoted.Append("\\b");
                break;
            case '\f':
                quoted.Append("\\f");
                break;
            default:
                if (ch < ' ')
                {
                    // Remaining control characters have no short escape; use \uXXXX.
                    quoted.Append("\\u");
                    quoted.Append(((int)ch).ToString("x4", System.Globalization.CultureInfo.InvariantCulture));
                }
                else
                {
                    quoted.Append(ch);
                }

                break;
        }
    }

    quoted.Append('"');
    return quoted.ToString();
}
|
|
|
|
/// <summary>
/// Disposes the auxiliary subscribers tracked by this service and then, if an actor system
/// exists, runs its CoordinatedShutdown and waits for it to finish.
/// </summary>
/// <param name="cancellationToken">Shutdown token supplied by the host; this method does not read it.</param>
/// <returns>A task that completes when shutdown has finished.</returns>
public async Task StopAsync(CancellationToken cancellationToken)
{
    // Auxiliary subscribers (e.g. SiteAuditTelemetryStalledTracker) are disposed BEFORE Akka
    // shuts down so their EventStream unsubscribe calls run while the system is still alive.
    // The list is copied and cleared under a lock: the test harness sometimes interleaves a
    // second start/stop, and a concurrent StartAsync must not race the enumeration. Clearing
    // under the same lock leaves the next StartAsync with an empty list.
    IDisposable[] pending;
    lock (_trackedDisposables)
    {
        pending = _trackedDisposables.ToArray();
        _trackedDisposables.Clear();
    }

    for (var index = 0; index < pending.Length; index++)
    {
        var subscriber = pending[index];

        // Each Dispose has its own try/catch so one misbehaving subscriber cannot sink the shutdown.
        try
        {
            subscriber.Dispose();
        }
        catch (Exception ex)
        {
            _logger.LogWarning(
                ex,
                "Auxiliary subscriber {Type} threw during shutdown",
                subscriber.GetType().Name);
        }
    }

    var system = _actorSystem;
    if (system is null)
    {
        // No actor system was ever created, so there is nothing to shut down.
        return;
    }

    _logger.LogInformation("Shutting down Akka.NET actor system via CoordinatedShutdown...");
    var coordinatedShutdown = Akka.Actor.CoordinatedShutdown.Get(system);
    await coordinatedShutdown.Run(Akka.Actor.CoordinatedShutdown.ClrExitReason.Instance);
    _logger.LogInformation("Akka.NET actor system shutdown complete.");
}
|
|
|
|
/// <summary>
/// Produces the cluster roles for this node. The configured role always comes first; a
/// "Site" node with a non-empty site id also gets a site-specific role
/// (e.g. "site-SiteA"), which is used to scope singleton placement.
/// </summary>
/// <returns>A new list holding one or two role names.</returns>
private List<string> BuildRoles()
{
    var baseRole = _nodeOptions.Role;
    var result = new List<string> { baseRole };

    // Only site nodes ever receive the extra per-site role.
    if (!baseRole.Equals("Site", StringComparison.OrdinalIgnoreCase))
    {
        return result;
    }

    var siteId = _nodeOptions.SiteId;
    if (string.IsNullOrEmpty(siteId))
    {
        return result;
    }

    result.Add($"site-{siteId}");
    return result;
}
|
|
|
|
/// <summary>
|
|
/// Registers central-side actors including the CentralCommunicationActor.
|
|
/// WP-4: Central communication actor routes all 8 message patterns to sites.
|
|
/// </summary>
|
|
private void RegisterCentralActors()
|
|
{
|
|
// Feed this central node's hostname into the local health collector so
|
|
// the CentralHealthReportLoop's report identifies the active node.
|
|
var centralHealthCollector = _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.HealthMonitoring.ISiteHealthCollector>();
|
|
centralHealthCollector?.SetNodeHostname(_nodeOptions.NodeHostname);
|
|
|
|
var siteClientFactory = new DefaultSiteClientFactory();
|
|
var centralCommActor = _actorSystem!.ActorOf(
|
|
Props.Create(() => new CentralCommunicationActor(_serviceProvider, siteClientFactory)),
|
|
"central-communication");
|
|
|
|
// Register CentralCommunicationActor with ClusterClientReceptionist so site ClusterClients can reach it
|
|
ClusterClientReceptionist.Get(_actorSystem).RegisterService(centralCommActor);
|
|
_logger.LogInformation("CentralCommunicationActor registered with ClusterClientReceptionist");
|
|
|
|
// Wire up the CommunicationService with the actor reference
|
|
var commService = _serviceProvider.GetService<CommunicationService>();
|
|
commService?.SetCommunicationActor(centralCommActor);
|
|
|
|
// Wire up the DebugStreamService with the ActorSystem
|
|
var debugStreamService = _serviceProvider.GetService<DebugStreamService>();
|
|
debugStreamService?.SetActorSystem(_actorSystem!);
|
|
|
|
|
|
// Management Service — accessible via ClusterClient
|
|
var mgmtLogger = _serviceProvider.GetRequiredService<ILoggerFactory>()
|
|
.CreateLogger<ZB.MOM.WW.ScadaBridge.ManagementService.ManagementActor>();
|
|
var mgmtActor = _actorSystem!.ActorOf(
|
|
Props.Create(() => new ZB.MOM.WW.ScadaBridge.ManagementService.ManagementActor(_serviceProvider, mgmtLogger)),
|
|
"management");
|
|
ClusterClientReceptionist.Get(_actorSystem).RegisterService(mgmtActor);
|
|
var mgmtHolder = _serviceProvider.GetRequiredService<ZB.MOM.WW.ScadaBridge.ManagementService.ManagementActorHolder>();
|
|
mgmtHolder.ActorRef = mgmtActor;
|
|
_logger.LogInformation("ManagementActor registered with ClusterClientReceptionist");
|
|
|
|
// Notification Outbox — cluster singleton so exactly one node owns ingest,
|
|
// the dispatch sweep and the purge loop. Central actors run on the base
|
|
// "Central" role, so the singleton settings are NOT role-scoped (unlike the
|
|
// site singletons, which are scoped to a per-site role).
|
|
var outboxOptions = _serviceProvider
|
|
.GetRequiredService<IOptions<ZB.MOM.WW.ScadaBridge.NotificationOutbox.NotificationOutboxOptions>>().Value;
|
|
var outboxLogger = _serviceProvider.GetRequiredService<ILoggerFactory>()
|
|
.CreateLogger<ZB.MOM.WW.ScadaBridge.NotificationOutbox.NotificationOutboxActor>();
|
|
// M4 Bundle B: central direct-write audit writer for dispatcher attempt
|
|
// + terminal events. Resolved once from the root provider — the writer
|
|
// is a singleton and stateless, opening per-call DI scopes internally
|
|
// to resolve the scoped IAuditLogRepository.
|
|
var outboxAuditWriter = _serviceProvider
|
|
.GetRequiredService<ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services.ICentralAuditWriter>();
|
|
|
|
var outboxSingletonProps = ClusterSingletonManager.Props(
|
|
singletonProps: Props.Create(() => new ZB.MOM.WW.ScadaBridge.NotificationOutbox.NotificationOutboxActor(
|
|
_serviceProvider,
|
|
outboxOptions,
|
|
outboxAuditWriter,
|
|
outboxLogger)),
|
|
terminationMessage: PoisonPill.Instance,
|
|
settings: ClusterSingletonManagerSettings.Create(_actorSystem!)
|
|
.WithSingletonName("notification-outbox"));
|
|
_actorSystem!.ActorOf(outboxSingletonProps, "notification-outbox-singleton");
|
|
|
|
var outboxProxyProps = ClusterSingletonProxy.Props(
|
|
singletonManagerPath: "/user/notification-outbox-singleton",
|
|
settings: ClusterSingletonProxySettings.Create(_actorSystem)
|
|
.WithSingletonName("notification-outbox"));
|
|
var outboxProxy = _actorSystem.ActorOf(outboxProxyProps, "notification-outbox-proxy");
|
|
|
|
// Hand the outbox proxy to the CentralCommunicationActor so forwarded
|
|
// NotificationSubmit messages from sites are routed to the outbox singleton.
|
|
centralCommActor.Tell(new RegisterNotificationOutbox(outboxProxy));
|
|
|
|
// Hand the same proxy to the CommunicationService so the Central UI can
|
|
// Ask the outbox actor directly (query, retry, discard, KPIs).
|
|
commService?.SetNotificationOutbox(outboxProxy);
|
|
_logger.LogInformation("NotificationOutbox singleton created and registered with CentralCommunicationActor");
|
|
|
|
// Audit Log (#23) — central singleton mirrors the Notification Outbox
|
|
// pattern. The IngestAuditEvents gRPC handler lives on SiteStreamGrpcServer
|
|
// (Communication.Grpc); a central node hosting that server (M6 reconciliation
|
|
// path) hands the proxy in via SetAuditIngestActor below. When the gRPC
|
|
// server is not registered (current central topology), the host still
|
|
// brings the singleton up so a Bundle H in-process test (or a future
|
|
// direct caller) can Ask the proxy without further wiring.
|
|
// IAuditLogRepository is a SCOPED EF Core service, so the singleton
|
|
// actor takes the root IServiceProvider and creates a fresh scope per
|
|
// message (mirroring NotificationOutboxActor). Pre-resolving the
|
|
// repository here would attempt to take a scoped service from the
|
|
// root and fail under DI scope validation.
|
|
var auditIngestLogger = _serviceProvider.GetRequiredService<ILoggerFactory>()
|
|
.CreateLogger<ZB.MOM.WW.ScadaBridge.AuditLog.Central.AuditLogIngestActor>();
|
|
|
|
var auditIngestSingletonProps = ClusterSingletonManager.Props(
|
|
singletonProps: Props.Create(() => new ZB.MOM.WW.ScadaBridge.AuditLog.Central.AuditLogIngestActor(
|
|
_serviceProvider,
|
|
auditIngestLogger)),
|
|
terminationMessage: PoisonPill.Instance,
|
|
settings: ClusterSingletonManagerSettings.Create(_actorSystem!)
|
|
.WithSingletonName("audit-log-ingest"));
|
|
_actorSystem!.ActorOf(auditIngestSingletonProps, "audit-log-ingest-singleton");
|
|
|
|
var auditIngestProxyProps = ClusterSingletonProxy.Props(
|
|
singletonManagerPath: "/user/audit-log-ingest-singleton",
|
|
settings: ClusterSingletonProxySettings.Create(_actorSystem)
|
|
.WithSingletonName("audit-log-ingest"));
|
|
var auditIngestProxy = _actorSystem.ActorOf(auditIngestProxyProps, "audit-log-ingest-proxy");
|
|
|
|
// Hand the audit-ingest proxy to the CentralCommunicationActor so audit
|
|
// ingest commands forwarded by sites over ClusterClient are routed to the
|
|
// singleton. Mirrors the RegisterNotificationOutbox wiring above.
|
|
centralCommActor.Tell(new RegisterAuditIngest(auditIngestProxy));
|
|
|
|
// Hand the proxy to the SiteStreamGrpcServer (if registered on this node)
|
|
// so the IngestAuditEvents RPC routes incoming site batches to the singleton.
|
|
// The gRPC server is currently only registered on Site nodes; on a central
|
|
// node this resolves to null and the wiring is a no-op until M6 (which
|
|
// brings central-hosted gRPC + a real site→central client).
|
|
var grpcServer = _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.Communication.Grpc.SiteStreamGrpcServer>();
|
|
grpcServer?.SetAuditIngestActor(auditIngestProxy);
|
|
_logger.LogInformation(
|
|
"AuditLogIngestActor singleton created (gRPC server bound: {GrpcBound})",
|
|
grpcServer is not null);
|
|
|
|
// Audit Log (#23) M6 Bundle E (T7): subscribe the per-site stalled
|
|
// telemetry tracker to the actor system EventStream NOW that the
|
|
// system exists. The tracker mirrors every
|
|
// SiteAuditTelemetryStalledChanged publication (from
|
|
// SiteAuditReconciliationActor — wired in a later bundle) into the
|
|
// AuditCentralHealthSnapshot singleton so the central health surface
|
|
// sees per-site stalled state. The tracker is constructed here rather
|
|
// than in AddAuditLogCentralMaintenance because its ctor needs an
|
|
// ActorSystem, which is not a DI-resolvable singleton — it's owned
|
|
// by this hosted service. The snapshot singleton is resolvable;
|
|
// passing it in seeds the tracker's Apply() so both internal state
|
|
// and the snapshot stay in lock-step.
|
|
var auditCentralSnapshot = _serviceProvider
|
|
.GetService<ZB.MOM.WW.ScadaBridge.AuditLog.Central.AuditCentralHealthSnapshot>();
|
|
if (auditCentralSnapshot is not null)
|
|
{
|
|
var stalledTracker = new ZB.MOM.WW.ScadaBridge.AuditLog.Central.SiteAuditTelemetryStalledTracker(
|
|
_actorSystem!, auditCentralSnapshot);
|
|
lock (_trackedDisposables)
|
|
{
|
|
_trackedDisposables.Add(stalledTracker);
|
|
}
|
|
_logger.LogInformation("SiteAuditTelemetryStalledTracker subscribed to EventStream");
|
|
}
|
|
|
|
// Site Call Audit (#22) — central singleton mirrors the AuditLogIngest
|
|
// and NotificationOutbox patterns. M3's dual-write transaction routes
|
|
// SiteCalls upserts through AuditLogIngestActor's own scope-per-message
|
|
// ISiteCallAuditRepository resolution, so this singleton is not on the
|
|
// M3 happy-path hot path; it exists so direct-write callers Ask through
|
|
// a stable cluster proxy without further wiring. The central→site
|
|
// Retry/Discard relay now lives in this actor (see the
|
|
// RegisterCentralCommunication wiring below); the reconciliation puller
|
|
// is the remaining deferred direct-write caller.
|
|
// Like AuditLogIngestActor, the actor takes the root IServiceProvider
|
|
// and creates a fresh scope per message because ISiteCallAuditRepository
|
|
// is a scoped EF Core service.
|
|
var siteCallAuditLogger = _serviceProvider.GetRequiredService<ILoggerFactory>()
|
|
.CreateLogger<ZB.MOM.WW.ScadaBridge.SiteCallAudit.SiteCallAuditActor>();
|
|
var siteCallAuditOptions = _serviceProvider
|
|
.GetRequiredService<IOptions<ZB.MOM.WW.ScadaBridge.SiteCallAudit.SiteCallAuditOptions>>().Value;
|
|
|
|
var siteCallAuditSingletonProps = ClusterSingletonManager.Props(
|
|
singletonProps: Props.Create(() => new ZB.MOM.WW.ScadaBridge.SiteCallAudit.SiteCallAuditActor(
|
|
_serviceProvider,
|
|
siteCallAuditOptions,
|
|
siteCallAuditLogger)),
|
|
terminationMessage: PoisonPill.Instance,
|
|
settings: ClusterSingletonManagerSettings.Create(_actorSystem!)
|
|
.WithSingletonName("site-call-audit"));
|
|
var siteCallAuditSingletonManager =
|
|
_actorSystem!.ActorOf(siteCallAuditSingletonProps, "site-call-audit-singleton");
|
|
|
|
// SiteCallAudit-002 graceful-handover hook. The default singleton handover
|
|
// path waits for the actor's `ReceiveAsync` task to complete before
|
|
// signalling `HandOverDone` to the new oldest node — so an in-flight
|
|
// EF `UpsertAsync` IS waited for during a *clean* coordinated shutdown
|
|
// (the cluster-leave phase below fires before the singleton terminates).
|
|
// The risk the finding tracks is the seam between in-flight async work
|
|
// and the cluster-leave + singleton-stop sequence: we bound it by
|
|
// issuing an explicit `GracefulStop` to the singleton manager early
|
|
// in `cluster-leave`, with a timeout that lets the running upsert + SQL
|
|
// round-trip drain before the handover-to-other-node race window
|
|
// opens. The timeout is bounded so a misbehaving upsert cannot stall
|
|
// coordinated shutdown indefinitely — exceeding it falls through to
|
|
// the existing PoisonPill termination path. Same pattern is suitable
|
|
// for the NotificationOutbox singleton; not added here to keep this
|
|
// change minimal (out of NS-020's scope).
|
|
var siteCallAuditShutdown = Akka.Actor.CoordinatedShutdown.Get(_actorSystem);
|
|
siteCallAuditShutdown.AddTask(
|
|
Akka.Actor.CoordinatedShutdown.PhaseClusterLeave,
|
|
"drain-site-call-audit-singleton",
|
|
async () =>
|
|
{
|
|
try
|
|
{
|
|
await siteCallAuditSingletonManager.GracefulStop(TimeSpan.FromSeconds(10));
|
|
}
|
|
catch (Exception ex)
|
|
{
|
|
_logger.LogWarning(ex,
|
|
"SiteCallAudit singleton did not drain within the graceful-stop "
|
|
+ "timeout; falling through to PoisonPill handover");
|
|
}
|
|
return Akka.Done.Instance;
|
|
});
|
|
|
|
var siteCallAuditProxyProps = ClusterSingletonProxy.Props(
|
|
singletonManagerPath: "/user/site-call-audit-singleton",
|
|
settings: ClusterSingletonProxySettings.Create(_actorSystem)
|
|
.WithSingletonName("site-call-audit"));
|
|
var siteCallAuditProxy = _actorSystem.ActorOf(siteCallAuditProxyProps, "site-call-audit-proxy");
|
|
|
|
// Hand the proxy to the CommunicationService so the Central UI can Ask
|
|
// the Site Call Audit actor directly (query, KPIs, detail) — mirrors the
|
|
// SetNotificationOutbox wiring above.
|
|
commService?.SetSiteCallAudit(siteCallAuditProxy);
|
|
|
|
// Task 5 (#22): hand the CentralCommunicationActor to the SiteCallAudit
|
|
// actor so it can relay operator Retry/Discard on parked cached calls to
|
|
// the owning site (over the per-site ClusterClient via SiteEnvelope).
|
|
// Mirrors the RegisterAuditIngest / RegisterNotificationOutbox wiring;
|
|
// the message is sent to the singleton proxy so it reaches whichever
|
|
// central node currently hosts the singleton.
|
|
siteCallAuditProxy.Tell(
|
|
new ZB.MOM.WW.ScadaBridge.SiteCallAudit.RegisterCentralCommunication(centralCommActor));
|
|
_logger.LogInformation(
|
|
"SiteCallAuditActor singleton created and registered with CentralCommunicationActor");
|
|
|
|
// Audit Log (#23) M6 Bundle B/C — start the two central-only maintenance
|
|
// singletons that were fully implemented but never instantiated: the
|
|
// daily AuditLog partition-switch purge (AuditLogPurgeActor) and the
|
|
// periodic per-site audit-event reconciliation pull
|
|
// (SiteAuditReconciliationActor). Both mirror the SiteCallAudit /
|
|
// NotificationOutbox singleton pattern above: a ClusterSingletonManager
|
|
// pins the actor to the active central node, a ClusterSingletonProxy
|
|
// gives a stable address, and a PhaseClusterLeave graceful-stop task
|
|
// drains the in-flight tick before handover. Options + the production
|
|
// ISiteEnumerator + IPullAuditEventsClient come from
|
|
// AddAuditLogCentralReconciliationClient (central composition root only).
|
|
// Both actors take the root IServiceProvider and open their own per-tick
|
|
// DI scope because IAuditLogRepository / ISiteRepository are scoped EF
|
|
// Core services.
|
|
var auditPurgeLogger = _serviceProvider.GetRequiredService<ILoggerFactory>()
|
|
.CreateLogger<ZB.MOM.WW.ScadaBridge.AuditLog.Central.AuditLogPurgeActor>();
|
|
var auditPurgeOptions = _serviceProvider
|
|
.GetRequiredService<IOptions<ZB.MOM.WW.ScadaBridge.AuditLog.Central.AuditLogPurgeOptions>>();
|
|
var auditLogOptions = _serviceProvider
|
|
.GetRequiredService<IOptions<ZB.MOM.WW.ScadaBridge.AuditLog.Configuration.AuditLogOptions>>();
|
|
|
|
var auditPurgeSingletonProps = ClusterSingletonManager.Props(
|
|
singletonProps: Props.Create(() => new ZB.MOM.WW.ScadaBridge.AuditLog.Central.AuditLogPurgeActor(
|
|
_serviceProvider,
|
|
auditPurgeOptions,
|
|
auditLogOptions,
|
|
auditPurgeLogger)),
|
|
terminationMessage: PoisonPill.Instance,
|
|
settings: ClusterSingletonManagerSettings.Create(_actorSystem!)
|
|
.WithSingletonName("audit-log-purge"));
|
|
var auditPurgeSingletonManager =
|
|
_actorSystem!.ActorOf(auditPurgeSingletonProps, "audit-log-purge-singleton");
|
|
|
|
var auditPurgeShutdown = Akka.Actor.CoordinatedShutdown.Get(_actorSystem);
|
|
auditPurgeShutdown.AddTask(
|
|
Akka.Actor.CoordinatedShutdown.PhaseClusterLeave,
|
|
"drain-audit-log-purge-singleton",
|
|
async () =>
|
|
{
|
|
try
|
|
{
|
|
await auditPurgeSingletonManager.GracefulStop(TimeSpan.FromSeconds(10));
|
|
}
|
|
catch (Exception ex)
|
|
{
|
|
_logger.LogWarning(ex,
|
|
"AuditLogPurge singleton did not drain within the graceful-stop "
|
|
+ "timeout; falling through to PoisonPill handover");
|
|
}
|
|
return Akka.Done.Instance;
|
|
});
|
|
|
|
var auditPurgeProxyProps = ClusterSingletonProxy.Props(
|
|
singletonManagerPath: "/user/audit-log-purge-singleton",
|
|
settings: ClusterSingletonProxySettings.Create(_actorSystem)
|
|
.WithSingletonName("audit-log-purge"));
|
|
_actorSystem.ActorOf(auditPurgeProxyProps, "audit-log-purge-proxy");
|
|
_logger.LogInformation("AuditLogPurgeActor singleton created");
|
|
|
|
// SiteAuditReconciliationActor — self-healing fallback puller. Resolves
|
|
// its production ISiteEnumerator (config-DB Site projection) and
|
|
// IPullAuditEventsClient (gRPC) from the central reconciliation-client
|
|
// helper registered in Program.cs.
|
|
var auditReconLogger = _serviceProvider.GetRequiredService<ILoggerFactory>()
|
|
.CreateLogger<ZB.MOM.WW.ScadaBridge.AuditLog.Central.SiteAuditReconciliationActor>();
|
|
var auditReconOptions = _serviceProvider
|
|
.GetRequiredService<IOptions<ZB.MOM.WW.ScadaBridge.AuditLog.Central.SiteAuditReconciliationOptions>>();
|
|
var auditReconSites = _serviceProvider
|
|
.GetRequiredService<ZB.MOM.WW.ScadaBridge.AuditLog.Central.ISiteEnumerator>();
|
|
var auditReconClient = _serviceProvider
|
|
.GetRequiredService<ZB.MOM.WW.ScadaBridge.AuditLog.Central.IPullAuditEventsClient>();
|
|
|
|
var auditReconSingletonProps = ClusterSingletonManager.Props(
|
|
singletonProps: Props.Create(() => new ZB.MOM.WW.ScadaBridge.AuditLog.Central.SiteAuditReconciliationActor(
|
|
auditReconSites,
|
|
auditReconClient,
|
|
_serviceProvider,
|
|
auditReconOptions,
|
|
auditReconLogger)),
|
|
terminationMessage: PoisonPill.Instance,
|
|
settings: ClusterSingletonManagerSettings.Create(_actorSystem!)
|
|
.WithSingletonName("site-audit-reconciliation"));
|
|
var auditReconSingletonManager =
|
|
_actorSystem!.ActorOf(auditReconSingletonProps, "site-audit-reconciliation-singleton");
|
|
|
|
var auditReconShutdown = Akka.Actor.CoordinatedShutdown.Get(_actorSystem);
|
|
auditReconShutdown.AddTask(
|
|
Akka.Actor.CoordinatedShutdown.PhaseClusterLeave,
|
|
"drain-site-audit-reconciliation-singleton",
|
|
async () =>
|
|
{
|
|
try
|
|
{
|
|
await auditReconSingletonManager.GracefulStop(TimeSpan.FromSeconds(10));
|
|
}
|
|
catch (Exception ex)
|
|
{
|
|
_logger.LogWarning(ex,
|
|
"SiteAuditReconciliation singleton did not drain within the graceful-stop "
|
|
+ "timeout; falling through to PoisonPill handover");
|
|
}
|
|
return Akka.Done.Instance;
|
|
});
|
|
|
|
var auditReconProxyProps = ClusterSingletonProxy.Props(
|
|
singletonManagerPath: "/user/site-audit-reconciliation-singleton",
|
|
settings: ClusterSingletonProxySettings.Create(_actorSystem)
|
|
.WithSingletonName("site-audit-reconciliation"));
|
|
_actorSystem.ActorOf(auditReconProxyProps, "site-audit-reconciliation-proxy");
|
|
_logger.LogInformation("SiteAuditReconciliationActor singleton created");
|
|
|
|
// KPI History (#26, M6) — central singleton that periodically samples the
|
|
// Notification Outbox / Site Call Audit point-in-time KPIs into the
|
|
// KpiHistorySamples table and runs the daily retention purge. Mirrors the
|
|
// audit-log-purge singleton pattern above: a ClusterSingletonManager pins
|
|
// the recorder to the active central node, a ClusterSingletonProxy gives a
|
|
// stable address, and a PhaseClusterLeave graceful-stop task drains the
|
|
// in-flight tick before handover. The recorder's sample + purge timers
|
|
// self-schedule in PreStart. Options come from AddKpiHistory (central
|
|
// composition root only). The actor takes the root IServiceProvider and
|
|
// opens its own per-tick DI scope (the KPI repository is a scoped EF Core
|
|
// service), so the 3 ctor args (IServiceProvider, KpiHistoryOptions,
|
|
// ILogger) are resolved here from DI exactly like the other singletons.
|
|
// NOT readiness-gated by design: KPI history is observability/best-effort
|
|
// (it must never gate /health/ready), so kpi-history-recorder is
|
|
// deliberately absent from RequiredSingletonsHealthCheck.
|
|
var kpiHistoryOptions = _serviceProvider
|
|
.GetRequiredService<IOptions<ZB.MOM.WW.ScadaBridge.KpiHistory.KpiHistoryOptions>>().Value;
|
|
var kpiHistoryLogger = _serviceProvider.GetRequiredService<ILoggerFactory>()
|
|
.CreateLogger<ZB.MOM.WW.ScadaBridge.KpiHistory.KpiHistoryRecorderActor>();
|
|
|
|
var kpiHistorySingletonProps = ClusterSingletonManager.Props(
|
|
singletonProps: Props.Create(() => new ZB.MOM.WW.ScadaBridge.KpiHistory.KpiHistoryRecorderActor(
|
|
_serviceProvider,
|
|
kpiHistoryOptions,
|
|
kpiHistoryLogger)),
|
|
terminationMessage: PoisonPill.Instance,
|
|
settings: ClusterSingletonManagerSettings.Create(_actorSystem!)
|
|
.WithSingletonName("kpi-history-recorder"));
|
|
var kpiHistorySingletonManager =
|
|
_actorSystem!.ActorOf(kpiHistorySingletonProps, "kpi-history-recorder-singleton");
|
|
|
|
var kpiHistoryShutdown = Akka.Actor.CoordinatedShutdown.Get(_actorSystem);
|
|
kpiHistoryShutdown.AddTask(
|
|
Akka.Actor.CoordinatedShutdown.PhaseClusterLeave,
|
|
"drain-kpi-history-recorder-singleton",
|
|
async () =>
|
|
{
|
|
try
|
|
{
|
|
await kpiHistorySingletonManager.GracefulStop(TimeSpan.FromSeconds(10));
|
|
}
|
|
catch (Exception ex)
|
|
{
|
|
_logger.LogWarning(ex,
|
|
"KpiHistoryRecorder singleton did not drain within the graceful-stop "
|
|
+ "timeout; falling through to PoisonPill handover");
|
|
}
|
|
return Akka.Done.Instance;
|
|
});
|
|
|
|
var kpiHistoryProxyProps = ClusterSingletonProxy.Props(
|
|
singletonManagerPath: "/user/kpi-history-recorder-singleton",
|
|
settings: ClusterSingletonProxySettings.Create(_actorSystem)
|
|
.WithSingletonName("kpi-history-recorder"));
|
|
_actorSystem.ActorOf(kpiHistoryProxyProps, "kpi-history-recorder-proxy");
|
|
_logger.LogInformation("KpiHistoryRecorderActor singleton created (not readiness-gated)");
|
|
|
|
_logger.LogInformation("Central actors registered. CentralCommunicationActor created.");
|
|
}
|
|
|
|
/// <summary>
/// Registers the site-specific actors on this node: the optional Data Connection Layer
/// manager, the per-node CertStoreActor and SiteReplicationActor, the Deployment Manager
/// and event-log cluster singletons (each with a proxy), the SiteCommunicationActor and
/// its local handlers, the store-and-forward delivery handlers, the optional
/// ClusterClient to central, the per-node SiteReconciliationActor and the
/// SiteAuditTelemetryActor. As its last step it marks the site gRPC server ready.
/// The singletons are scoped to the site-specific cluster role so each runs on exactly
/// one node within this site's cluster.
/// </summary>
/// <param name="cancellationToken">
/// Checked once, immediately before the store-and-forward service is started.
/// </param>
/// <returns>
/// A task that completes once every site actor has been created and the gRPC server
/// (when registered) has been marked ready.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The actor system has not been created yet, or a Notification-category
/// store-and-forward delivery handler was already registered (NotificationService-020).
/// </exception>
private async Task RegisterSiteActorsAsync(CancellationToken cancellationToken)
{
    // Narrow the nullable field once instead of null-forgiving it at every use; a
    // missing actor system is a wiring bug and should fail with a clear message.
    var actorSystem = _actorSystem
        ?? throw new InvalidOperationException(
            "RegisterSiteActorsAsync requires the actor system to be created first.");

    // NOTE(review): SiteId is assumed to be set for site roles — confirm that startup
    // validation enforces it; this method does not check it.
    var siteId = _nodeOptions.SiteId!;
    var siteRole = $"site-{_nodeOptions.SiteId}";

    var storage = _serviceProvider.GetRequiredService<SiteStorageService>();
    var compilationService = _serviceProvider.GetRequiredService<ScriptCompilationService>();
    var sharedScriptLibrary = _serviceProvider.GetRequiredService<SharedScriptLibrary>();
    var streamManager = _serviceProvider.GetRequiredService<SiteStreamManager>();
    streamManager.Initialize(actorSystem);
    var siteRuntimeOptionsValue = _serviceProvider.GetService<IOptions<SiteRuntimeOptions>>()?.Value
        ?? new SiteRuntimeOptions();

    // Resolved once; every actor logger below is created from this factory.
    var loggerFactory = _serviceProvider.GetRequiredService<ILoggerFactory>();
    var dmLogger = loggerFactory.CreateLogger<DeploymentManagerActor>();

    // WP-34: Create DCL Manager Actor for tag subscriptions
    var dclFactory = _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.DataConnectionLayer.IDataConnectionFactory>();
    var dclOptions = _serviceProvider.GetService<IOptions<ZB.MOM.WW.ScadaBridge.DataConnectionLayer.DataConnectionOptions>>()?.Value
        ?? new ZB.MOM.WW.ScadaBridge.DataConnectionLayer.DataConnectionOptions();

    // T17: deployment-wide OPC UA options, resolved once and shared by the DCL manager
    // (whose verify-endpoint probe builds an OPC UA ApplicationConfiguration directly)
    // and the per-node CertStoreActor, so a trusted cert lands in the exact store
    // RealOpcUaClient validates against.
    var opcUaGlobalOptions = _serviceProvider
        .GetService<IOptions<ZB.MOM.WW.ScadaBridge.DataConnectionLayer.OpcUaGlobalOptions>>()?.Value
        ?? new ZB.MOM.WW.ScadaBridge.DataConnectionLayer.OpcUaGlobalOptions();

    IActorRef? dclManager = null;
    if (dclFactory != null)
    {
        var healthCollector = _serviceProvider.GetRequiredService<ZB.MOM.WW.ScadaBridge.HealthMonitoring.ISiteHealthCollector>();
        var siteEventLogger = _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.SiteEventLogging.ISiteEventLogger>();
        dclManager = actorSystem.ActorOf(
            Props.Create(() => new ZB.MOM.WW.ScadaBridge.DataConnectionLayer.Actors.DataConnectionManagerActor(
                dclFactory, dclOptions, healthCollector, siteEventLogger, opcUaGlobalOptions)),
            "dcl-manager");
        _logger.LogInformation("Data Connection Layer manager actor created");
    }

    // T17 / D6 — per-node OPC UA certificate-store actor. Created on EVERY
    // site node (NOT a singleton) at a well-known name so the Deployment
    // Manager singleton can fan a trust/remove out to BOTH nodes' PKI stores
    // (node-a + node-b) and keep them in lock-step across failover.
    actorSystem.ActorOf(
        Props.Create(() => new ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors.CertStoreActor(opcUaGlobalOptions)),
        ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors.CertStoreActor.WellKnownName);
    _logger.LogInformation("Per-node CertStoreActor created at well-known name '{Name}' (T17/D6)",
        ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors.CertStoreActor.WellKnownName);

    // Resolve the health collector for the Deployment Manager
    var siteHealthCollector = _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.HealthMonitoring.ISiteHealthCollector>();
    siteHealthCollector?.SetNodeHostname(_nodeOptions.NodeHostname);

    // Notify-and-fetch: the deployment config fetcher pulls a deployment's flattened
    // config from central over HTTP. Used by BOTH the active singleton
    // (RefreshDeploymentCommand, Task 10) AND the standby replication path — the active
    // node now replicates only the deployment id and the standby fetches the config
    // itself, so a large config never crosses the intra-site Akka hop. Resolve once.
    var deploymentConfigFetcher =
        _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.SiteRuntime.Deployment.IDeploymentConfigFetcher>();

    // Create SiteReplicationActor on every node (not a singleton)
    var sfStorage = _serviceProvider.GetRequiredService<StoreAndForwardStorage>();
    var replicationService = _serviceProvider.GetRequiredService<ReplicationService>();
    var replicationLogger = loggerFactory.CreateLogger<SiteReplicationActor>();

    var replicationActor = actorSystem.ActorOf(
        Props.Create(() => new SiteReplicationActor(
            storage, sfStorage, replicationService, siteRole, replicationLogger,
            deploymentConfigFetcher)),
        "site-replication");

    // Wire S&F replication handler to forward operations via the replication actor
    replicationService.SetReplicationHandler(op =>
    {
        replicationActor.Tell(new ReplicateStoreAndForward(op));
        return Task.CompletedTask;
    });

    _logger.LogInformation("SiteReplicationActor created and S&F replication handler wired");

    // Create the Deployment Manager as a cluster singleton
    var singletonProps = ClusterSingletonManager.Props(
        singletonProps: Props.Create(() => new DeploymentManagerActor(
            storage,
            compilationService,
            sharedScriptLibrary,
            streamManager,
            siteRuntimeOptionsValue,
            dmLogger,
            dclManager,
            replicationActor,
            siteHealthCollector,
            _serviceProvider,
            null,
            deploymentConfigFetcher)),
        terminationMessage: PoisonPill.Instance,
        settings: ClusterSingletonManagerSettings.Create(actorSystem)
            .WithRole(siteRole)
            .WithSingletonName("deployment-manager"));

    actorSystem.ActorOf(singletonProps, "deployment-manager-singleton");

    // Create a proxy for other actors to communicate with the singleton
    var proxyProps = ClusterSingletonProxy.Props(
        singletonManagerPath: "/user/deployment-manager-singleton",
        settings: ClusterSingletonProxySettings.Create(actorSystem)
            .WithRole(siteRole)
            .WithSingletonName("deployment-manager"));

    var dmProxy = actorSystem.ActorOf(proxyProps, "deployment-manager-proxy");

    // WP-4: Create SiteCommunicationActor for receiving messages from central
    var siteCommActor = actorSystem.ActorOf(
        Props.Create(() => new SiteCommunicationActor(
            siteId,
            _communicationOptions,
            dmProxy)),
        "site-communication");

    // Register local handlers with SiteCommunicationActor
    siteCommActor.Tell(new RegisterLocalHandler(LocalHandlerType.Artifacts, dmProxy));

    // Event log handler — cluster singleton so queries always reach the
    // active node. The event log is node-local SQLite and is not
    // replicated; only the active node records events. A per-node handler
    // would let a ClusterClient query land on the standby and find nothing.
    var eventLogQueryService = _serviceProvider.GetService<SiteEventLogging.IEventLogQueryService>();
    if (eventLogQueryService != null)
    {
        var eventLogSingletonProps = ClusterSingletonManager.Props(
            singletonProps: Props.Create(() => new SiteEventLogging.EventLogHandlerActor(eventLogQueryService)),
            terminationMessage: PoisonPill.Instance,
            settings: ClusterSingletonManagerSettings.Create(actorSystem)
                .WithRole(siteRole)
                .WithSingletonName("event-log-handler"));
        actorSystem.ActorOf(eventLogSingletonProps, "event-log-handler-singleton");

        var eventLogProxyProps = ClusterSingletonProxy.Props(
            singletonManagerPath: "/user/event-log-handler-singleton",
            settings: ClusterSingletonProxySettings.Create(actorSystem)
                .WithRole(siteRole)
                .WithSingletonName("event-log-handler"));
        var eventLogProxy = actorSystem.ActorOf(eventLogProxyProps, "event-log-handler-proxy");

        siteCommActor.Tell(new RegisterLocalHandler(LocalHandlerType.EventLog, eventLogProxy));
    }

    // Parked message handler — bridges Akka to StoreAndForwardService
    var storeAndForwardService = _serviceProvider.GetService<StoreAndForwardService>();
    if (storeAndForwardService != null)
    {
        // Initialize SQLite schema and start the retry timer. Must complete before
        // any actor or HTTP handler touches the service. Host-005: awaited rather
        // than blocked via GetAwaiter().GetResult() — no thread-pool starvation /
        // sync-context deadlock risk, and exceptions surface as their original type.
        cancellationToken.ThrowIfCancellationRequested();
        await storeAndForwardService.StartAsync();

        // Register the store-and-forward delivery handlers so buffered
        // ExternalSystem calls, cached DB writes and notifications are actually
        // delivered by the retry sweep. Without this, every buffered message is
        // persisted but never delivered. Each handler resolves its scoped consumer
        // service in a fresh DI scope — the sweep runs on a timer thread, outside
        // any request scope.
        storeAndForwardService.RegisterDeliveryHandler(
            ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.StoreAndForwardCategory.ExternalSystem,
            async msg =>
            {
                using var scope = _serviceProvider.CreateScope();
                return await scope.ServiceProvider
                    .GetRequiredService<ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.ExternalSystemClient>()
                    .DeliverBufferedAsync(msg);
            });
        storeAndForwardService.RegisterDeliveryHandler(
            ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.StoreAndForwardCategory.CachedDbWrite,
            async msg =>
            {
                using var scope = _serviceProvider.CreateScope();
                return await scope.ServiceProvider
                    .GetRequiredService<ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.DatabaseGateway>()
                    .DeliverBufferedAsync(msg);
            });

        // Notification Outbox: a buffered notification is no longer delivered by
        // the site over SMTP. "Delivering" it means forwarding it to the central
        // cluster via the SiteCommunicationActor and treating central's
        // NotificationSubmitAck as the outcome (accepted → delivered; not accepted
        // or timeout → throw → transient → keep buffering). Central owns SMTP.
        //
        // NotificationService-020: register exactly once. The sentinel guard
        // catches a second registration path that re-introduces the dead
        // NS-001 site-SMTP handler — see the sentinel's XML doc above for the
        // historical context. Throwing here is intentional: a silent overwrite
        // by a future maintainer would invert the design back to site-side
        // delivery (NotificationForwarder vs. NotificationDeliveryService).
        if (_notificationDeliveryHandlerRegistered)
        {
            throw new InvalidOperationException(
                "NotificationService-020: A Notification-category store-and-forward "
                + "delivery handler was already registered. The canonical handler is "
                + "NotificationForwarder (central-only delivery, post-redesign). "
                + "If you are re-introducing a second registration path, remove the "
                + "first one — RegisterDeliveryHandler is last-write-wins per category "
                + "and a duplicate inverts the design.");
        }

        var notificationForwarder = new ZB.MOM.WW.ScadaBridge.StoreAndForward.NotificationForwarder(
            siteCommActor,
            siteId,
            _communicationOptions.NotificationForwardTimeout);
        storeAndForwardService.RegisterDeliveryHandler(
            ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.StoreAndForwardCategory.Notification,
            notificationForwarder.DeliverAsync);
        _notificationDeliveryHandlerRegistered = true;
        _logger.LogInformation(
            "Store-and-forward delivery handlers registered (ExternalSystem, CachedDbWrite, Notification)");

        var parkedMessageHandler = actorSystem.ActorOf(
            Props.Create(() => new ParkedMessageHandlerActor(
                storeAndForwardService, siteId)),
            "parked-message-handler");
        siteCommActor.Tell(new RegisterLocalHandler(LocalHandlerType.ParkedMessages, parkedMessageHandler));
    }

    // Register SiteCommunicationActor with ClusterClientReceptionist so central ClusterClients can reach it
    ClusterClientReceptionist.Get(actorSystem).RegisterService(siteCommActor);

    _logger.LogInformation(
        "Site actors registered. DeploymentManager singleton scoped to role={SiteRole}, SiteCommunicationActor created.",
        siteRole);

    // Create ClusterClient to central if contact points are configured
    if (_communicationOptions.CentralContactPoints.Count > 0)
    {
        var contacts = _communicationOptions.CentralContactPoints
            .Select(cp => ActorPath.Parse($"{cp}/system/receptionist"))
            .ToImmutableHashSet();
        var clientSettings = ClusterClientSettings.Create(actorSystem)
            .WithInitialContacts(contacts);
        var centralClient = actorSystem.ActorOf(
            ClusterClient.Props(clientSettings), "central-cluster-client");

        // Tell the actor reference created above directly rather than re-resolving
        // "/user/site-communication" through an ActorSelection: no duplicated path
        // string that would silently dead-letter if the actor name ever changed.
        siteCommActor.Tell(new RegisterCentralClient(centralClient));

        _logger.LogInformation(
            "Created ClusterClient to central with {Count} contact point(s) for site {SiteId}",
            contacts.Count, _nodeOptions.SiteId);
    }

    // Task 18c — per-node startup reconciliation. Created on EVERY site node (NOT a
    // singleton) so a standby that was DOWN during a deploy self-heals on its next
    // restart: it reports its local deployed inventory to central via the
    // SiteCommunicationActor Ask, fetches the gap (missing/stale) over HTTP, and
    // guarded-writes it (orphans are logged, never deleted). Requires the HTTP
    // config fetcher; if it is somehow absent the self-heal is skipped (best-effort —
    // replication remains the primary path and the next restart retries).
    if (deploymentConfigFetcher != null)
    {
        var reconcileLogger = loggerFactory.CreateLogger<SiteReconciliationActor>();
        actorSystem.ActorOf(
            Props.Create(() => new SiteReconciliationActor(
                storage,
                deploymentConfigFetcher,
                siteCommActor,
                siteId,
                _nodeOptions.NodeName,
                reconcileLogger,
                null,
                null)),
            "site-reconciliation");
        _logger.LogInformation(
            "SiteReconciliationActor created (per-node startup self-heal) for site {SiteId} node {Node}",
            _nodeOptions.SiteId, _nodeOptions.NodeName);
    }
    else
    {
        _logger.LogWarning(
            "No IDeploymentConfigFetcher available; SiteReconciliationActor not created — "
            + "startup self-heal disabled (replication remains the primary path)");
    }

    // Audit Log (#23) — site-side telemetry actor that drains the SQLite
    // Pending queue and pushes to central via IngestAuditEvents. Not a
    // cluster singleton: each site is its own cluster, and the actor reads
    // node-local SQLite (no replication). The Props are bound to the
    // dedicated audit-telemetry-dispatcher (defined in BuildHocon) so a
    // batch SQLite read + gRPC push never contend with the default
    // dispatcher used by hot-path actors.
    //
    // Per Bundle E's brief: the SiteAuditTelemetryActor takes its
    // collaborators through its constructor, so we resolve them from DI
    // and pass them in via Props.Create rather than relying on a future
    // FactoryProvider. The real site→central client is constructed and
    // wired immediately below: a ClusterClientSiteAuditClient (ClusterClient
    // transport, not gRPC) replaces the DI-default NoOpSiteStreamAuditClient
    // for site roles, without disturbing the rest of this wiring.
    var siteAuditOptions = _serviceProvider
        .GetRequiredService<IOptions<ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry.SiteAuditTelemetryOptions>>();
    var siteAuditQueue = _serviceProvider
        .GetRequiredService<ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services.ISiteAuditQueue>();

    // Audit Log (#23) Task 2 follow-up: the production site→central audit
    // push uses the ClusterClient transport via the SiteCommunicationActor,
    // not the DI-resolved NoOpSiteStreamAuditClient. The NoOp default stays
    // correct for central/test composition roots (no SiteCommunicationActor);
    // a site role wires the real ClusterClient-based client here so the
    // SQLite Pending backlog actually drains to central. The forward Ask
    // reuses NotificationForwardTimeout — the same site→central command
    // forward bound notifications already use over this transport.
    ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry.ISiteStreamAuditClient siteAuditClient =
        new ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry.ClusterClientSiteAuditClient(
            siteCommActor,
            _communicationOptions.NotificationForwardTimeout);
    var siteAuditLogger =
        loggerFactory.CreateLogger<ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry.SiteAuditTelemetryActor>();

    // AuditLog-001: resolve the site-local operation tracking store so the
    // actor can run the combined-telemetry cached-drain in parallel with
    // the audit-only drain. The store is registered by AddSiteRuntime on
    // site composition roots; on central it is intentionally absent and
    // the cached-drain scheduler is never armed (the central side has no
    // outbound cached calls to track). GetService — null when not
    // registered — matches the optional-param contract on the actor ctor.
    var siteTrackingStore = _serviceProvider
        .GetService<ZB.MOM.WW.ScadaBridge.Commons.Interfaces.IOperationTrackingStore>();

    var siteAuditTelemetryProps = Props.Create(() =>
        new ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry.SiteAuditTelemetryActor(
            siteAuditQueue,
            siteAuditClient,
            siteAuditOptions,
            siteAuditLogger,
            siteTrackingStore))
        .WithDispatcher("audit-telemetry-dispatcher");
    actorSystem.ActorOf(siteAuditTelemetryProps, "site-audit-telemetry");
    _logger.LogInformation(
        "SiteAuditTelemetryActor created (dispatcher=audit-telemetry-dispatcher, client={ClientType}, cachedDrain={CachedDrainEnabled})",
        siteAuditClient.GetType().Name,
        siteTrackingStore is not null);

    // Gate gRPC subscriptions until the actor system and SiteStreamManager are
    // initialized (REQ-HOST-7).
    //
    // Host-009: SetReady asserts a deliberately narrow contract. By this point the
    // actor system exists, SiteStreamManager.Initialize has run, and every
    // role actor (SiteCommunicationActor, deployment-manager singleton,
    // SiteReplicationActor, the ClusterClient) has been created with ActorOf —
    // creation and the registration Tells are synchronous and strictly ordered.
    // What is NOT guaranteed is completion of each actor's PreStart or the
    // ClusterClient's initial-contact handshake with central: those are
    // intentionally asynchronous. Gating readiness on the central handshake would
    // be wrong — a site must come up and stream locally even while central is
    // briefly unreachable. gRPC readiness therefore guarantees "the site actor
    // graph exists and can accept subscription streams", not "the cluster
    // handshake has completed". Streams opened before SetReady are already
    // rejected by SiteStreamGrpcServer with StatusCode.Unavailable.
    var grpcServer = _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.Communication.Grpc.SiteStreamGrpcServer>();

    // Audit Log (#23 M6): hand the site-local SqliteAuditWriter (which
    // implements ISiteAuditQueue) to the gRPC server so the PullAuditEvents
    // reconciliation RPC can serve central's pulls. Both the writer and the
    // gRPC server are singletons — wiring this here keeps the dependency
    // direction one-way (Host knows both; Communication doesn't reach back
    // into AuditLog).
    grpcServer?.SetSiteAuditQueue(siteAuditQueue);

    // Site Call Audit (#22): hand the site-local OperationTrackingStore to
    // the gRPC server so the PullSiteCalls reconciliation RPC can serve
    // central's self-heal pulls. siteTrackingStore is resolved above with
    // GetService — present on site composition roots, null on central — so
    // wire the seam only when the store exists. Like SetSiteAuditQueue, both
    // the store and the gRPC server are singletons; wiring here keeps the
    // dependency direction one-way (Host knows both; Communication doesn't
    // reach back into SiteRuntime).
    if (siteTrackingStore is not null)
    {
        grpcServer?.SetOperationTrackingStore(siteTrackingStore);
    }

    grpcServer?.SetReady(actorSystem);
}
|
|
}
|