refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)
Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
This commit is contained in:
@@ -0,0 +1,849 @@
|
||||
using System.Collections.Immutable;
|
||||
using Akka.Actor;
|
||||
using Akka.Cluster;
|
||||
using Akka.Cluster.Tools.Client;
|
||||
using Akka.Cluster.Tools.Singleton;
|
||||
using Akka.Configuration;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.ClusterInfrastructure;
|
||||
using ZB.MOM.WW.ScadaBridge.Communication;
|
||||
using ZB.MOM.WW.ScadaBridge.Communication.Actors;
|
||||
using ZB.MOM.WW.ScadaBridge.Host.Actors;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Streaming;
|
||||
using ZB.MOM.WW.ScadaBridge.StoreAndForward;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host.Actors;
|
||||
|
||||
/// <summary>
|
||||
/// Hosted service that manages the Akka.NET actor system lifecycle.
|
||||
/// Creates the actor system on start, registers actors, and triggers
|
||||
/// CoordinatedShutdown on stop.
|
||||
///
|
||||
/// WP-3: Transport heartbeat is explicitly configured in HOCON from CommunicationOptions.
|
||||
/// </summary>
|
||||
public class AkkaHostedService : IHostedService
|
||||
{
|
||||
private readonly IServiceProvider _serviceProvider;
|
||||
private readonly NodeOptions _nodeOptions;
|
||||
private readonly ClusterOptions _clusterOptions;
|
||||
private readonly CommunicationOptions _communicationOptions;
|
||||
private readonly ILogger<AkkaHostedService> _logger;
|
||||
private ActorSystem? _actorSystem;
|
||||
/// <summary>
|
||||
/// Auxiliary IDisposables (e.g. the SiteAuditTelemetryStalledTracker)
|
||||
/// that this hosted service constructs at start time and must tear down
|
||||
/// on shutdown — they don't fit the ActorSystem lifecycle but share its
|
||||
/// process scope.
|
||||
/// </summary>
|
||||
private readonly List<IDisposable> _trackedDisposables = new();
|
||||
|
||||
/// <summary>
|
||||
/// NotificationService-020 guard: sentinel that flips to <c>true</c> the
|
||||
/// first time a Notification-category S&F delivery handler is registered
|
||||
/// on this hosted service instance. <see cref="StoreAndForwardService.RegisterDeliveryHandler"/>
|
||||
/// is last-write-wins on category, so a future code change that introduces
|
||||
/// a second registration path (e.g. a role-branch + helper that both call
|
||||
/// the registration) would silently overwrite the canonical
|
||||
/// <c>NotificationForwarder</c> handler with whatever the loser registers —
|
||||
/// the prior NS-001 fix did exactly this, and was silently superseded
|
||||
/// when the central-only redesign moved delivery to <c>NotificationOutbox</c>.
|
||||
/// This sentinel makes the duplicate noisy at startup so a maintainer
|
||||
/// re-introducing the second path sees it immediately.
|
||||
/// </summary>
|
||||
private bool _notificationDeliveryHandlerRegistered;
|
||||
|
||||
/// <summary>
/// Creates the hosted service. Nothing is started here: the constructor only
/// captures its collaborators and unwraps the bound option objects.
/// </summary>
/// <param name="serviceProvider">Root service provider used later to resolve actor dependencies.</param>
/// <param name="nodeOptions">Bound node options (role, hostname, remoting port, site id).</param>
/// <param name="clusterOptions">Bound cluster options (seed nodes, split-brain and failure-detector settings).</param>
/// <param name="communicationOptions">Bound communication options (transport heartbeat settings).</param>
/// <param name="logger">Logger for lifecycle messages.</param>
public AkkaHostedService(
    IServiceProvider serviceProvider,
    IOptions<NodeOptions> nodeOptions,
    IOptions<ClusterOptions> clusterOptions,
    IOptions<CommunicationOptions> communicationOptions,
    ILogger<AkkaHostedService> logger)
{
    // Infrastructure collaborators.
    _logger = logger;
    _serviceProvider = serviceProvider;

    // Option values are read once, at construction time.
    _communicationOptions = communicationOptions.Value;
    _clusterOptions = clusterOptions.Value;
    _nodeOptions = nodeOptions.Value;
}
|
||||
|
||||
/// <summary>
/// The actor system owned by this service. Returns <c>null</c> until
/// <see cref="StartAsync"/> has created it.
/// </summary>
public ActorSystem? ActorSystem
{
    get { return _actorSystem; }
}
|
||||
|
||||
/// <summary>
/// Creates the Akka.NET actor system from the generated HOCON, starts the
/// dead-letter monitor and then registers the actors that belong to this
/// node's role ("Central" or "Site").
/// </summary>
/// <param name="cancellationToken">Token forwarded to the site actor registration.</param>
/// <returns>A task that completes once the role-specific actors are registered.</returns>
public async Task StartAsync(CancellationToken cancellationToken)
{
    // Site nodes carry a site-specific role (e.g. "site-SiteA") next to the base role.
    var roles = BuildRoles();

    // WP-3: the transport heartbeat comes from CommunicationOptions, not framework defaults.
    var heartbeatInterval = _communicationOptions.TransportHeartbeatInterval;
    var failureThreshold = _communicationOptions.TransportFailureThreshold;

    // Host-006: BuildHocon quotes/escapes every interpolated string value, so an
    // unusual hostname, seed node or strategy cannot corrupt the document.
    var hocon = BuildHocon(_nodeOptions, _clusterOptions, roles, heartbeatInterval, failureThreshold);

    var system = ActorSystem.Create("scadabridge", ConfigurationFactory.ParseString(hocon));
    _actorSystem = system;

    _logger.LogInformation(
        "Akka.NET actor system 'scadabridge' started. Role={Role}, Roles={Roles}, Hostname={Hostname}, Port={Port}, " +
        "TransportHeartbeat={TransportHeartbeat}s, TransportFailure={TransportFailure}s",
        _nodeOptions.Role,
        string.Join(", ", roles),
        _nodeOptions.NodeHostname,
        _nodeOptions.RemotingPort,
        heartbeatInterval.TotalSeconds,
        failureThreshold.TotalSeconds);

    // The dead-letter monitor runs on every node regardless of role.
    var deadLetterLogger = _serviceProvider
        .GetRequiredService<ILoggerFactory>()
        .CreateLogger<DeadLetterMonitorActor>();
    var deadLetterHealthCollector =
        _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.HealthMonitoring.ISiteHealthCollector>();
    system.ActorOf(
        Props.Create(() => new DeadLetterMonitorActor(deadLetterLogger, deadLetterHealthCollector)),
        "dead-letter-monitor");

    // Role-specific actors. Any other role value registers nothing further.
    var role = _nodeOptions.Role;
    if (role.Equals("Central", StringComparison.OrdinalIgnoreCase))
    {
        RegisterCentralActors();
        return;
    }

    if (role.Equals("Site", StringComparison.OrdinalIgnoreCase))
    {
        await RegisterSiteActorsAsync(cancellationToken);
    }
}
|
||||
|
||||
/// <summary>
/// Produces the HOCON document used to create the actor system.
///
/// Host-006: every string value (hostname, seed nodes, roles, split-brain
/// strategy) goes through <see cref="QuoteHocon"/> so quotes, backslashes or
/// whitespace in a configured value cannot break the document.
///
/// Host-012: <c>keep-oldest.down-if-alone</c> is driven by
/// <see cref="ClusterOptions.DownIfAlone"/> instead of being hard-coded.
///
/// Host-013: all durations go through <see cref="DurationHocon"/> and are
/// emitted in milliseconds, so sub-second values (e.g. 750ms) are not rounded
/// to whole seconds.
/// </summary>
/// <param name="nodeOptions">Supplies the remoting hostname and port.</param>
/// <param name="clusterOptions">Supplies seed nodes, member count, split-brain and failure-detector settings.</param>
/// <param name="roles">Cluster roles to advertise for this node.</param>
/// <param name="transportHeartbeat">Remoting transport heartbeat interval.</param>
/// <param name="transportFailure">Remoting transport acceptable heartbeat pause.</param>
/// <returns>The complete HOCON configuration text.</returns>
public static string BuildHocon(
    NodeOptions nodeOptions,
    ClusterOptions clusterOptions,
    IEnumerable<string> roles,
    TimeSpan transportHeartbeat,
    TimeSpan transportFailure)
{
    // List values: each element quoted individually, then comma-joined.
    var seedNodeList = string.Join(",", clusterOptions.SeedNodes.Select(seed => QuoteHocon(seed)));
    var roleList = string.Join(",", roles.Select(role => QuoteHocon(role)));

    // Scalar values, computed in the order they appear in the document.
    var hostname = QuoteHocon(nodeOptions.NodeHostname);
    var port = nodeOptions.RemotingPort;
    var transportHeartbeatInterval = DurationHocon(transportHeartbeat);
    var transportHeartbeatPause = DurationHocon(transportFailure);
    var minMembers = clusterOptions.MinNrOfMembers;
    var activeStrategy = QuoteHocon(clusterOptions.SplitBrainResolverStrategy);
    var stableAfter = DurationHocon(clusterOptions.StableAfter);
    var downIfAlone = clusterOptions.DownIfAlone ? "on" : "off";
    var clusterHeartbeatInterval = DurationHocon(clusterOptions.HeartbeatInterval);
    var clusterHeartbeatPause = DurationHocon(clusterOptions.FailureDetectionThreshold);

    return $@"
audit-telemetry-dispatcher {{
  type = ForkJoinDispatcher
  throughput = 100
  dedicated-thread-pool {{
    thread-count = 2
  }}
}}
akka {{
  extensions = [
    ""Akka.Cluster.Tools.PublishSubscribe.DistributedPubSubExtensionProvider, Akka.Cluster.Tools""
  ]
  actor {{
    provider = cluster
  }}
  remote {{
    dot-netty.tcp {{
      hostname = {hostname}
      port = {port}
    }}
    transport-failure-detector {{
      heartbeat-interval = {transportHeartbeatInterval}
      acceptable-heartbeat-pause = {transportHeartbeatPause}
    }}
  }}
  cluster {{
    seed-nodes = [{seedNodeList}]
    roles = [{roleList}]
    min-nr-of-members = {minMembers}
    split-brain-resolver {{
      active-strategy = {activeStrategy}
      stable-after = {stableAfter}
      keep-oldest {{
        down-if-alone = {downIfAlone}
      }}
    }}
    failure-detector {{
      heartbeat-interval = {clusterHeartbeatInterval}
      acceptable-heartbeat-pause = {clusterHeartbeatPause}
    }}
    run-coordinated-shutdown-when-down = on
  }}
  coordinated-shutdown {{
    run-by-clr-shutdown-hook = on
  }}
}}";
}
|
||||
|
||||
/// <summary>
/// Formats a <see cref="TimeSpan"/> as a HOCON duration with an <c>ms</c>
/// suffix. The value is rounded to whole milliseconds, so a 750ms heartbeat is
/// written as <c>750ms</c> rather than being rounded to whole seconds (Host-013).
/// </summary>
/// <param name="duration">The duration to render.</param>
/// <returns>The whole-millisecond count followed by <c>ms</c>.</returns>
private static string DurationHocon(TimeSpan duration)
{
    var wholeMilliseconds = (long)Math.Round(duration.TotalMilliseconds);
    return wholeMilliseconds + "ms";
}
|
||||
|
||||
/// <summary>
/// Renders a value as a HOCON double-quoted string. Backslashes and double
/// quotes are escaped so the token cannot break out of its string literal, and
/// control characters (newline, carriage return, tab, backspace, form feed and
/// any other control character) are written as JSON-style escape sequences
/// instead of being emitted raw inside the quoted string.
/// </summary>
/// <param name="value">The raw value; <c>null</c> is rendered as an empty string.</param>
/// <returns>The value wrapped in double quotes with all special characters escaped.</returns>
private static string QuoteHocon(string? value)
{
    var text = value ?? string.Empty;
    var quoted = new System.Text.StringBuilder(text.Length + 2);
    quoted.Append('"');

    foreach (var ch in text)
    {
        switch (ch)
        {
            case '\\':
                quoted.Append("\\\\");
                break;
            case '"':
                quoted.Append("\\\"");
                break;
            case '\n':
                quoted.Append("\\n");
                break;
            case '\r':
                quoted.Append("\\r");
                break;
            case '\t':
                quoted.Append("\\t");
                break;
            case '\b':
                quoted.Append("\\b");
                break;
            case '\f':
                quoted.Append("\\f");
                break;
            default:
                if (char.IsControl(ch))
                {
                    // Remaining control characters have no short escape; use \uXXXX.
                    quoted.Append("\\u");
                    quoted.Append(((int)ch).ToString("x4", System.Globalization.CultureInfo.InvariantCulture));
                }
                else
                {
                    quoted.Append(ch);
                }

                break;
        }
    }

    quoted.Append('"');
    return quoted.ToString();
}
|
||||
|
||||
/// <summary>
/// Disposes the auxiliary subscribers tracked by this service and then runs
/// Akka's CoordinatedShutdown on the actor system, if one was created.
/// </summary>
/// <param name="cancellationToken">Shutdown token supplied by the host (not consulted by this method).</param>
/// <returns>A task that completes when coordinated shutdown has finished.</returns>
public async Task StopAsync(CancellationToken cancellationToken)
{
    // Take a snapshot and empty the tracked list under the lock, so a
    // concurrent StartAsync (the test harness sometimes interleaves a second
    // start/stop) cannot race the enumeration and the next start begins with
    // a clean list.
    IDisposable[] pending;
    lock (_trackedDisposables)
    {
        pending = _trackedDisposables.ToArray();
        _trackedDisposables.Clear();
    }

    // Auxiliary subscribers (e.g. SiteAuditTelemetryStalledTracker) are
    // disposed BEFORE Akka shuts down so their EventStream unsubscribe calls
    // run while the system is still alive. Each Dispose is guarded on its own
    // so one misbehaving subscriber cannot sink the shutdown.
    for (var index = 0; index < pending.Length; index++)
    {
        var subscriber = pending[index];
        try
        {
            subscriber.Dispose();
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex,
                "Auxiliary subscriber {Type} threw during shutdown",
                subscriber.GetType().Name);
        }
    }

    if (_actorSystem == null)
    {
        return;
    }

    _logger.LogInformation("Shutting down Akka.NET actor system via CoordinatedShutdown...");
    var coordinatedShutdown = Akka.Actor.CoordinatedShutdown.Get(_actorSystem);
    await coordinatedShutdown.Run(Akka.Actor.CoordinatedShutdown.ClrExitReason.Instance);
    _logger.LogInformation("Akka.NET actor system shutdown complete.");
}
|
||||
|
||||
/// <summary>
/// Computes the cluster roles for this node. The configured role is always
/// included; a "Site" node with a non-empty site id additionally gets a
/// site-specific role (e.g. "site-SiteA") used to scope singleton placement.
/// </summary>
/// <returns>The role list, base role first.</returns>
private List<string> BuildRoles()
{
    var roles = new List<string>();
    roles.Add(_nodeOptions.Role);

    var isSiteNode = _nodeOptions.Role.Equals("Site", StringComparison.OrdinalIgnoreCase);
    if (!isSiteNode)
    {
        return roles;
    }

    if (string.IsNullOrEmpty(_nodeOptions.SiteId))
    {
        return roles;
    }

    roles.Add($"site-{_nodeOptions.SiteId}");
    return roles;
}
|
||||
|
||||
/// <summary>
/// Registers central-side actors including the CentralCommunicationActor.
/// WP-4: Central communication actor routes all 8 message patterns to sites.
///
/// This method is the orchestrator: it creates the communication actor, wires
/// the DI-held services that need it, and then delegates each remaining
/// concern (management actor, notification outbox, audit-log ingest, stalled
/// telemetry tracker, site-call audit) to a dedicated private helper. The
/// helpers run in the same order the original single-method implementation
/// performed the work.
/// </summary>
private void RegisterCentralActors()
{
    var system = _actorSystem!;

    // Feed this central node's hostname into the local health collector so
    // the CentralHealthReportLoop's report identifies the active node.
    var centralHealthCollector =
        _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.HealthMonitoring.ISiteHealthCollector>();
    centralHealthCollector?.SetNodeHostname(_nodeOptions.NodeHostname);

    var siteClientFactory = new DefaultSiteClientFactory();
    var centralCommActor = system.ActorOf(
        Props.Create(() => new CentralCommunicationActor(_serviceProvider, siteClientFactory)),
        "central-communication");

    // Register with ClusterClientReceptionist so site ClusterClients can reach it.
    ClusterClientReceptionist.Get(system).RegisterService(centralCommActor);
    _logger.LogInformation("CentralCommunicationActor registered with ClusterClientReceptionist");

    // Wire up the CommunicationService with the actor reference.
    var commService = _serviceProvider.GetService<CommunicationService>();
    commService?.SetCommunicationActor(centralCommActor);

    // Wire up the DebugStreamService with the ActorSystem.
    var debugStreamService = _serviceProvider.GetService<DebugStreamService>();
    debugStreamService?.SetActorSystem(system);

    StartManagementActor(system);
    StartNotificationOutboxSingleton(system, centralCommActor, commService);
    StartAuditLogIngestSingleton(system, centralCommActor);
    StartStalledTelemetryTracker(system);
    StartSiteCallAuditSingleton(system, centralCommActor, commService);

    _logger.LogInformation("Central actors registered. CentralCommunicationActor created.");
}

/// <summary>
/// Creates a cluster singleton manager named <c>{singletonName}-singleton</c>
/// for a central singleton. The settings are NOT role-scoped: central actors
/// run on the base "Central" role (unlike the site singletons, which are
/// scoped to a per-site role).
/// </summary>
private static IActorRef CreateCentralSingletonManager(
    ActorSystem system,
    string singletonName,
    Props singletonProps)
{
    var managerProps = ClusterSingletonManager.Props(
        singletonProps: singletonProps,
        terminationMessage: PoisonPill.Instance,
        settings: ClusterSingletonManagerSettings.Create(system)
            .WithSingletonName(singletonName));
    return system.ActorOf(managerProps, $"{singletonName}-singleton");
}

/// <summary>
/// Creates the proxy actor <c>{singletonName}-proxy</c> that routes to the
/// singleton managed at <c>/user/{singletonName}-singleton</c>.
/// </summary>
private static IActorRef CreateCentralSingletonProxy(ActorSystem system, string singletonName)
{
    var proxyProps = ClusterSingletonProxy.Props(
        singletonManagerPath: $"/user/{singletonName}-singleton",
        settings: ClusterSingletonProxySettings.Create(system)
            .WithSingletonName(singletonName));
    return system.ActorOf(proxyProps, $"{singletonName}-proxy");
}

/// <summary>
/// Creates the ManagementActor, exposes it through ClusterClientReceptionist
/// and publishes its reference via the ManagementActorHolder.
/// </summary>
private void StartManagementActor(ActorSystem system)
{
    // Management Service — accessible via ClusterClient.
    var mgmtLogger = _serviceProvider.GetRequiredService<ILoggerFactory>()
        .CreateLogger<ZB.MOM.WW.ScadaBridge.ManagementService.ManagementActor>();
    var mgmtActor = system.ActorOf(
        Props.Create(() => new ZB.MOM.WW.ScadaBridge.ManagementService.ManagementActor(_serviceProvider, mgmtLogger)),
        "management");
    ClusterClientReceptionist.Get(system).RegisterService(mgmtActor);

    var mgmtHolder = _serviceProvider
        .GetRequiredService<ZB.MOM.WW.ScadaBridge.ManagementService.ManagementActorHolder>();
    mgmtHolder.ActorRef = mgmtActor;
    _logger.LogInformation("ManagementActor registered with ClusterClientReceptionist");
}

/// <summary>
/// Starts the Notification Outbox cluster singleton (so exactly one node owns
/// ingest, the dispatch sweep and the purge loop) and hands its proxy to the
/// CentralCommunicationActor and the CommunicationService.
/// </summary>
private void StartNotificationOutboxSingleton(
    ActorSystem system,
    IActorRef centralCommActor,
    CommunicationService? commService)
{
    var outboxOptions = _serviceProvider
        .GetRequiredService<IOptions<ZB.MOM.WW.ScadaBridge.NotificationOutbox.NotificationOutboxOptions>>().Value;
    var outboxLogger = _serviceProvider.GetRequiredService<ILoggerFactory>()
        .CreateLogger<ZB.MOM.WW.ScadaBridge.NotificationOutbox.NotificationOutboxActor>();

    // M4 Bundle B: central direct-write audit writer for dispatcher attempt
    // + terminal events. Resolved once from the root provider — the writer
    // is a singleton and stateless, opening per-call DI scopes internally
    // to resolve the scoped IAuditLogRepository.
    var outboxAuditWriter = _serviceProvider
        .GetRequiredService<ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services.ICentralAuditWriter>();

    CreateCentralSingletonManager(
        system,
        "notification-outbox",
        Props.Create(() => new ZB.MOM.WW.ScadaBridge.NotificationOutbox.NotificationOutboxActor(
            _serviceProvider,
            outboxOptions,
            outboxAuditWriter,
            outboxLogger)));
    var outboxProxy = CreateCentralSingletonProxy(system, "notification-outbox");

    // Forwarded NotificationSubmit messages from sites are routed to the
    // outbox singleton through this proxy.
    centralCommActor.Tell(new RegisterNotificationOutbox(outboxProxy));

    // The Central UI Asks the outbox actor directly (query, retry, discard, KPIs).
    commService?.SetNotificationOutbox(outboxProxy);
    _logger.LogInformation("NotificationOutbox singleton created and registered with CentralCommunicationActor");
}

/// <summary>
/// Starts the Audit Log (#23) ingest cluster singleton and hands its proxy to
/// the CentralCommunicationActor and, when one is registered on this node, to
/// the SiteStreamGrpcServer.
/// </summary>
private void StartAuditLogIngestSingleton(ActorSystem system, IActorRef centralCommActor)
{
    // IAuditLogRepository is a SCOPED EF Core service, so the singleton actor
    // takes the root IServiceProvider and creates a fresh scope per message
    // (mirroring NotificationOutboxActor). Pre-resolving the repository here
    // would attempt to take a scoped service from the root and fail under DI
    // scope validation.
    var auditIngestLogger = _serviceProvider.GetRequiredService<ILoggerFactory>()
        .CreateLogger<ZB.MOM.WW.ScadaBridge.AuditLog.Central.AuditLogIngestActor>();

    CreateCentralSingletonManager(
        system,
        "audit-log-ingest",
        Props.Create(() => new ZB.MOM.WW.ScadaBridge.AuditLog.Central.AuditLogIngestActor(
            _serviceProvider,
            auditIngestLogger)));
    var auditIngestProxy = CreateCentralSingletonProxy(system, "audit-log-ingest");

    // Audit ingest commands forwarded by sites over ClusterClient are routed
    // to the singleton through this proxy.
    centralCommActor.Tell(new RegisterAuditIngest(auditIngestProxy));

    // The gRPC server is currently only registered on Site nodes; on a central
    // node this resolves to null and the wiring is a no-op until M6 (which
    // brings central-hosted gRPC + a real site→central client). The singleton
    // is still brought up so an in-process caller can Ask the proxy.
    var grpcServer = _serviceProvider
        .GetService<ZB.MOM.WW.ScadaBridge.Communication.Grpc.SiteStreamGrpcServer>();
    grpcServer?.SetAuditIngestActor(auditIngestProxy);
    _logger.LogInformation(
        "AuditLogIngestActor singleton created (gRPC server bound: {GrpcBound})",
        grpcServer is not null);
}

/// <summary>
/// Audit Log (#23) M6 Bundle E (T7): constructs the per-site stalled telemetry
/// tracker when the central health snapshot is registered, and tracks it for
/// disposal in <see cref="StopAsync"/>.
/// </summary>
private void StartStalledTelemetryTracker(ActorSystem system)
{
    var auditCentralSnapshot = _serviceProvider
        .GetService<ZB.MOM.WW.ScadaBridge.AuditLog.Central.AuditCentralHealthSnapshot>();
    if (auditCentralSnapshot is null)
    {
        return;
    }

    // The tracker is constructed here rather than in DI because its ctor needs
    // an ActorSystem, which is owned by this hosted service and is not a
    // DI-resolvable singleton.
    var stalledTracker = new ZB.MOM.WW.ScadaBridge.AuditLog.Central.SiteAuditTelemetryStalledTracker(
        system, auditCentralSnapshot);
    lock (_trackedDisposables)
    {
        _trackedDisposables.Add(stalledTracker);
    }

    _logger.LogInformation("SiteAuditTelemetryStalledTracker subscribed to EventStream");
}

/// <summary>
/// Starts the Site Call Audit (#22) cluster singleton, registers a bounded
/// graceful-stop task for it in the cluster-leave shutdown phase, and wires
/// its proxy to the CommunicationService and the CentralCommunicationActor.
/// </summary>
private void StartSiteCallAuditSingleton(
    ActorSystem system,
    IActorRef centralCommActor,
    CommunicationService? commService)
{
    // Like AuditLogIngestActor, the actor takes the root IServiceProvider and
    // creates a fresh scope per message because ISiteCallAuditRepository is a
    // scoped EF Core service.
    var siteCallAuditLogger = _serviceProvider.GetRequiredService<ILoggerFactory>()
        .CreateLogger<ZB.MOM.WW.ScadaBridge.SiteCallAudit.SiteCallAuditActor>();
    var siteCallAuditOptions = _serviceProvider
        .GetRequiredService<IOptions<ZB.MOM.WW.ScadaBridge.SiteCallAudit.SiteCallAuditOptions>>().Value;

    var siteCallAuditSingletonManager = CreateCentralSingletonManager(
        system,
        "site-call-audit",
        Props.Create(() => new ZB.MOM.WW.ScadaBridge.SiteCallAudit.SiteCallAuditActor(
            _serviceProvider,
            siteCallAuditOptions,
            siteCallAuditLogger)));

    // SiteCallAudit-002 graceful-handover hook: issue an explicit GracefulStop
    // to the singleton manager in the cluster-leave phase, with a timeout that
    // lets a running upsert + SQL round-trip drain. The timeout is bounded so
    // a misbehaving upsert cannot stall coordinated shutdown indefinitely —
    // exceeding it is logged and shutdown continues with the existing
    // PoisonPill termination path.
    Akka.Actor.CoordinatedShutdown.Get(system).AddTask(
        Akka.Actor.CoordinatedShutdown.PhaseClusterLeave,
        "drain-site-call-audit-singleton",
        async () =>
        {
            try
            {
                await siteCallAuditSingletonManager.GracefulStop(TimeSpan.FromSeconds(10));
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex,
                    "SiteCallAudit singleton did not drain within the graceful-stop "
                    + "timeout; falling through to PoisonPill handover");
            }

            return Akka.Done.Instance;
        });

    var siteCallAuditProxy = CreateCentralSingletonProxy(system, "site-call-audit");

    // The Central UI Asks the Site Call Audit actor directly (query, KPIs, detail).
    commService?.SetSiteCallAudit(siteCallAuditProxy);

    // Task 5 (#22): hand the CentralCommunicationActor to the SiteCallAudit
    // actor so it can relay operator Retry/Discard on parked cached calls to
    // the owning site. The message goes to the singleton proxy so it reaches
    // whichever central node currently hosts the singleton.
    siteCallAuditProxy.Tell(
        new ZB.MOM.WW.ScadaBridge.SiteCallAudit.RegisterCentralCommunication(centralCommActor));
    _logger.LogInformation(
        "SiteCallAuditActor singleton created and registered with CentralCommunicationActor");
}
|
||||
|
||||
/// <summary>
|
||||
/// Registers site-specific actors including the Deployment Manager cluster singleton
|
||||
/// and the SiteCommunicationActor.
|
||||
/// The singleton is scoped to the site-specific cluster role so it runs on exactly
|
||||
/// one node within this site's cluster.
|
||||
/// </summary>
|
||||
private async Task RegisterSiteActorsAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
var siteRole = $"site-{_nodeOptions.SiteId}";
|
||||
var storage = _serviceProvider.GetRequiredService<SiteStorageService>();
|
||||
var compilationService = _serviceProvider.GetRequiredService<ScriptCompilationService>();
|
||||
var sharedScriptLibrary = _serviceProvider.GetRequiredService<SharedScriptLibrary>();
|
||||
var streamManager = _serviceProvider.GetRequiredService<SiteStreamManager>();
|
||||
streamManager.Initialize(_actorSystem!);
|
||||
var siteRuntimeOptionsValue = _serviceProvider.GetService<IOptions<SiteRuntimeOptions>>()?.Value
|
||||
?? new SiteRuntimeOptions();
|
||||
var dmLogger = _serviceProvider.GetRequiredService<ILoggerFactory>()
|
||||
.CreateLogger<DeploymentManagerActor>();
|
||||
|
||||
// WP-34: Create DCL Manager Actor for tag subscriptions
|
||||
var dclFactory = _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.DataConnectionLayer.IDataConnectionFactory>();
|
||||
var dclOptions = _serviceProvider.GetService<IOptions<ZB.MOM.WW.ScadaBridge.DataConnectionLayer.DataConnectionOptions>>()?.Value
|
||||
?? new ZB.MOM.WW.ScadaBridge.DataConnectionLayer.DataConnectionOptions();
|
||||
IActorRef? dclManager = null;
|
||||
if (dclFactory != null)
|
||||
{
|
||||
var healthCollector = _serviceProvider.GetRequiredService<ZB.MOM.WW.ScadaBridge.HealthMonitoring.ISiteHealthCollector>();
|
||||
var siteEventLogger = _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.SiteEventLogging.ISiteEventLogger>();
|
||||
dclManager = _actorSystem!.ActorOf(
|
||||
Props.Create(() => new ZB.MOM.WW.ScadaBridge.DataConnectionLayer.Actors.DataConnectionManagerActor(
|
||||
dclFactory, dclOptions, healthCollector, siteEventLogger)),
|
||||
"dcl-manager");
|
||||
_logger.LogInformation("Data Connection Layer manager actor created");
|
||||
}
|
||||
|
||||
// Resolve the health collector for the Deployment Manager
|
||||
var siteHealthCollector = _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.HealthMonitoring.ISiteHealthCollector>();
|
||||
siteHealthCollector?.SetNodeHostname(_nodeOptions.NodeHostname);
|
||||
|
||||
// Create SiteReplicationActor on every node (not a singleton)
|
||||
var sfStorage = _serviceProvider.GetRequiredService<StoreAndForwardStorage>();
|
||||
var replicationService = _serviceProvider.GetRequiredService<ReplicationService>();
|
||||
var replicationLogger = _serviceProvider.GetRequiredService<ILoggerFactory>()
|
||||
.CreateLogger<SiteReplicationActor>();
|
||||
|
||||
var replicationActor = _actorSystem!.ActorOf(
|
||||
Props.Create(() => new SiteReplicationActor(
|
||||
storage, sfStorage, replicationService, siteRole, replicationLogger)),
|
||||
"site-replication");
|
||||
|
||||
// Wire S&F replication handler to forward operations via the replication actor
|
||||
replicationService.SetReplicationHandler(op =>
|
||||
{
|
||||
replicationActor.Tell(new ReplicateStoreAndForward(op));
|
||||
return Task.CompletedTask;
|
||||
});
|
||||
|
||||
_logger.LogInformation("SiteReplicationActor created and S&F replication handler wired");
|
||||
|
||||
// Create the Deployment Manager as a cluster singleton
|
||||
var singletonProps = ClusterSingletonManager.Props(
|
||||
singletonProps: Props.Create(() => new DeploymentManagerActor(
|
||||
storage,
|
||||
compilationService,
|
||||
sharedScriptLibrary,
|
||||
streamManager,
|
||||
siteRuntimeOptionsValue,
|
||||
dmLogger,
|
||||
dclManager,
|
||||
replicationActor,
|
||||
siteHealthCollector,
|
||||
_serviceProvider)),
|
||||
terminationMessage: PoisonPill.Instance,
|
||||
settings: ClusterSingletonManagerSettings.Create(_actorSystem!)
|
||||
.WithRole(siteRole)
|
||||
.WithSingletonName("deployment-manager"));
|
||||
|
||||
_actorSystem!.ActorOf(singletonProps, "deployment-manager-singleton");
|
||||
|
||||
// Create a proxy for other actors to communicate with the singleton
|
||||
var proxyProps = ClusterSingletonProxy.Props(
|
||||
singletonManagerPath: "/user/deployment-manager-singleton",
|
||||
settings: ClusterSingletonProxySettings.Create(_actorSystem)
|
||||
.WithRole(siteRole)
|
||||
.WithSingletonName("deployment-manager"));
|
||||
|
||||
var dmProxy = _actorSystem.ActorOf(proxyProps, "deployment-manager-proxy");
|
||||
|
||||
// WP-4: Create SiteCommunicationActor for receiving messages from central
|
||||
var siteCommActor = _actorSystem.ActorOf(
|
||||
Props.Create(() => new SiteCommunicationActor(
|
||||
_nodeOptions.SiteId!,
|
||||
_communicationOptions,
|
||||
dmProxy)),
|
||||
"site-communication");
|
||||
|
||||
// Register local handlers with SiteCommunicationActor
|
||||
siteCommActor.Tell(new RegisterLocalHandler(LocalHandlerType.Artifacts, dmProxy));
|
||||
|
||||
// Event log handler — cluster singleton so queries always reach the
|
||||
// active node. The event log is node-local SQLite and is not
|
||||
// replicated; only the active node records events. A per-node handler
|
||||
// would let a ClusterClient query land on the standby and find nothing.
|
||||
var eventLogQueryService = _serviceProvider.GetService<SiteEventLogging.IEventLogQueryService>();
|
||||
if (eventLogQueryService != null)
|
||||
{
|
||||
var eventLogSingletonProps = ClusterSingletonManager.Props(
|
||||
singletonProps: Props.Create(() => new SiteEventLogging.EventLogHandlerActor(eventLogQueryService)),
|
||||
terminationMessage: PoisonPill.Instance,
|
||||
settings: ClusterSingletonManagerSettings.Create(_actorSystem)
|
||||
.WithRole(siteRole)
|
||||
.WithSingletonName("event-log-handler"));
|
||||
_actorSystem.ActorOf(eventLogSingletonProps, "event-log-handler-singleton");
|
||||
|
||||
var eventLogProxyProps = ClusterSingletonProxy.Props(
|
||||
singletonManagerPath: "/user/event-log-handler-singleton",
|
||||
settings: ClusterSingletonProxySettings.Create(_actorSystem)
|
||||
.WithRole(siteRole)
|
||||
.WithSingletonName("event-log-handler"));
|
||||
var eventLogProxy = _actorSystem.ActorOf(eventLogProxyProps, "event-log-handler-proxy");
|
||||
|
||||
siteCommActor.Tell(new RegisterLocalHandler(LocalHandlerType.EventLog, eventLogProxy));
|
||||
}
|
||||
|
||||
// Parked message handler — bridges Akka to StoreAndForwardService
|
||||
var storeAndForwardService = _serviceProvider.GetService<StoreAndForwardService>();
|
||||
if (storeAndForwardService != null)
|
||||
{
|
||||
// Initialize SQLite schema and start the retry timer. Must complete before
|
||||
// any actor or HTTP handler touches the service. Host-005: awaited rather
|
||||
// than blocked via GetAwaiter().GetResult() — no thread-pool starvation /
|
||||
// sync-context deadlock risk, and exceptions surface as their original type.
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
await storeAndForwardService.StartAsync();
|
||||
|
||||
// Register the store-and-forward delivery handlers so buffered
|
||||
// ExternalSystem calls, cached DB writes and notifications are actually
|
||||
// delivered by the retry sweep. Without this, every buffered message is
|
||||
// persisted but never delivered. Each handler resolves its scoped consumer
|
||||
// service in a fresh DI scope — the sweep runs on a timer thread, outside
|
||||
// any request scope.
|
||||
storeAndForwardService.RegisterDeliveryHandler(
|
||||
ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.StoreAndForwardCategory.ExternalSystem,
|
||||
async msg =>
|
||||
{
|
||||
using var scope = _serviceProvider.CreateScope();
|
||||
return await scope.ServiceProvider
|
||||
.GetRequiredService<ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.ExternalSystemClient>()
|
||||
.DeliverBufferedAsync(msg);
|
||||
});
|
||||
storeAndForwardService.RegisterDeliveryHandler(
|
||||
ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.StoreAndForwardCategory.CachedDbWrite,
|
||||
async msg =>
|
||||
{
|
||||
using var scope = _serviceProvider.CreateScope();
|
||||
return await scope.ServiceProvider
|
||||
.GetRequiredService<ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.DatabaseGateway>()
|
||||
.DeliverBufferedAsync(msg);
|
||||
});
|
||||
// Notification Outbox: a buffered notification is no longer delivered by
|
||||
// the site over SMTP. "Delivering" it means forwarding it to the central
|
||||
// cluster via the SiteCommunicationActor and treating central's
|
||||
// NotificationSubmitAck as the outcome (accepted → delivered; not accepted
|
||||
// or timeout → throw → transient → keep buffering). Central owns SMTP.
|
||||
//
|
||||
// NotificationService-020: register exactly once. The sentinel guard
|
||||
// catches a second registration path that re-introduces the dead
|
||||
// NS-001 site-SMTP handler — see the sentinel's XML doc above for the
|
||||
// historical context. Throwing here is intentional: a silent overwrite
|
||||
// by a future maintainer would invert the design back to site-side
|
||||
// delivery (NotificationForwarder vs. NotificationDeliveryService).
|
||||
if (_notificationDeliveryHandlerRegistered)
|
||||
{
|
||||
throw new InvalidOperationException(
|
||||
"NotificationService-020: A Notification-category store-and-forward "
|
||||
+ "delivery handler was already registered. The canonical handler is "
|
||||
+ "NotificationForwarder (central-only delivery, post-redesign). "
|
||||
+ "If you are re-introducing a second registration path, remove the "
|
||||
+ "first one — RegisterDeliveryHandler is last-write-wins per category "
|
||||
+ "and a duplicate inverts the design.");
|
||||
}
|
||||
var notificationForwarder = new ZB.MOM.WW.ScadaBridge.StoreAndForward.NotificationForwarder(
|
||||
siteCommActor,
|
||||
_nodeOptions.SiteId!,
|
||||
_communicationOptions.NotificationForwardTimeout);
|
||||
storeAndForwardService.RegisterDeliveryHandler(
|
||||
ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.StoreAndForwardCategory.Notification,
|
||||
notificationForwarder.DeliverAsync);
|
||||
_notificationDeliveryHandlerRegistered = true;
|
||||
_logger.LogInformation(
|
||||
"Store-and-forward delivery handlers registered (ExternalSystem, CachedDbWrite, Notification)");
|
||||
|
||||
var parkedMessageHandler = _actorSystem.ActorOf(
|
||||
Props.Create(() => new ParkedMessageHandlerActor(
|
||||
storeAndForwardService, _nodeOptions.SiteId!)),
|
||||
"parked-message-handler");
|
||||
siteCommActor.Tell(new RegisterLocalHandler(LocalHandlerType.ParkedMessages, parkedMessageHandler));
|
||||
}
|
||||
|
||||
// Register SiteCommunicationActor with ClusterClientReceptionist so central ClusterClients can reach it
|
||||
ClusterClientReceptionist.Get(_actorSystem).RegisterService(siteCommActor);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Site actors registered. DeploymentManager singleton scoped to role={SiteRole}, SiteCommunicationActor created.",
|
||||
siteRole);
|
||||
|
||||
// Create ClusterClient to central if contact points are configured
|
||||
if (_communicationOptions.CentralContactPoints.Count > 0)
|
||||
{
|
||||
var contacts = _communicationOptions.CentralContactPoints
|
||||
.Select(cp => ActorPath.Parse($"{cp}/system/receptionist"))
|
||||
.ToImmutableHashSet();
|
||||
var clientSettings = ClusterClientSettings.Create(_actorSystem)
|
||||
.WithInitialContacts(contacts);
|
||||
var centralClient = _actorSystem.ActorOf(
|
||||
ClusterClient.Props(clientSettings), "central-cluster-client");
|
||||
|
||||
var siteCommSelection = _actorSystem.ActorSelection("/user/site-communication");
|
||||
siteCommSelection.Tell(new RegisterCentralClient(centralClient));
|
||||
|
||||
_logger.LogInformation(
|
||||
"Created ClusterClient to central with {Count} contact point(s) for site {SiteId}",
|
||||
contacts.Count, _nodeOptions.SiteId);
|
||||
}
|
||||
|
||||
// Audit Log (#23) — site-side telemetry actor that drains the SQLite
|
||||
// Pending queue and pushes to central via IngestAuditEvents. Not a
|
||||
// cluster singleton: each site is its own cluster, and the actor reads
|
||||
// node-local SQLite (no replication). The Props are bound to the
|
||||
// dedicated audit-telemetry-dispatcher (defined in BuildHocon) so a
|
||||
// batch SQLite read + gRPC push never contend with the default
|
||||
// dispatcher used by hot-path actors.
|
||||
//
|
||||
// Per Bundle E's brief: the SiteAuditTelemetryActor takes its
|
||||
// collaborators through its constructor, so we resolve them from DI
|
||||
// and pass them in via Props.Create rather than relying on a future
|
||||
// FactoryProvider. The real site→central client is constructed and
|
||||
// wired immediately below: a ClusterClientSiteAuditClient (ClusterClient
|
||||
// transport, not gRPC) replaces the DI-default NoOpSiteStreamAuditClient
|
||||
// for site roles, without disturbing the rest of this wiring.
|
||||
var siteAuditOptions = _serviceProvider
|
||||
.GetRequiredService<IOptions<ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry.SiteAuditTelemetryOptions>>();
|
||||
var siteAuditQueue = _serviceProvider
|
||||
.GetRequiredService<ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services.ISiteAuditQueue>();
|
||||
// Audit Log (#23) Task 2 follow-up: the production site→central audit
|
||||
// push uses the ClusterClient transport via the SiteCommunicationActor,
|
||||
// not the DI-resolved NoOpSiteStreamAuditClient. The NoOp default stays
|
||||
// correct for central/test composition roots (no SiteCommunicationActor);
|
||||
// a site role wires the real ClusterClient-based client here so the
|
||||
// SQLite Pending backlog actually drains to central. The forward Ask
|
||||
// reuses NotificationForwardTimeout — the same site→central command
|
||||
// forward bound notifications already use over this transport.
|
||||
ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry.ISiteStreamAuditClient siteAuditClient =
|
||||
new ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry.ClusterClientSiteAuditClient(
|
||||
siteCommActor,
|
||||
_communicationOptions.NotificationForwardTimeout);
|
||||
var siteAuditLogger = _serviceProvider.GetRequiredService<ILoggerFactory>()
|
||||
.CreateLogger<ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry.SiteAuditTelemetryActor>();
|
||||
|
||||
// AuditLog-001: resolve the site-local operation tracking store so the
|
||||
// actor can run the combined-telemetry cached-drain in parallel with
|
||||
// the audit-only drain. The store is registered by AddSiteRuntime on
|
||||
// site composition roots; on central it is intentionally absent and
|
||||
// the cached-drain scheduler is never armed (the central side has no
|
||||
// outbound cached calls to track). GetService — null when not
|
||||
// registered — matches the optional-param contract on the actor ctor.
|
||||
var siteTrackingStore = _serviceProvider
|
||||
.GetService<ZB.MOM.WW.ScadaBridge.Commons.Interfaces.IOperationTrackingStore>();
|
||||
|
||||
var siteAuditTelemetryProps = Props.Create(() =>
|
||||
new ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry.SiteAuditTelemetryActor(
|
||||
siteAuditQueue,
|
||||
siteAuditClient,
|
||||
siteAuditOptions,
|
||||
siteAuditLogger,
|
||||
siteTrackingStore))
|
||||
.WithDispatcher("audit-telemetry-dispatcher");
|
||||
_actorSystem.ActorOf(siteAuditTelemetryProps, "site-audit-telemetry");
|
||||
_logger.LogInformation(
|
||||
"SiteAuditTelemetryActor created (dispatcher=audit-telemetry-dispatcher, client={ClientType}, cachedDrain={CachedDrainEnabled})",
|
||||
siteAuditClient.GetType().Name,
|
||||
siteTrackingStore is not null);
|
||||
|
||||
// Gate gRPC subscriptions until the actor system and SiteStreamManager are
|
||||
// initialized (REQ-HOST-7).
|
||||
//
|
||||
// Host-009: SetReady asserts a deliberately narrow contract. By this point the
|
||||
// actor system exists, SiteStreamManager.Initialize has run, and every
|
||||
// role actor (SiteCommunicationActor, deployment-manager singleton,
|
||||
// SiteReplicationActor, the ClusterClient) has been created with ActorOf —
|
||||
// creation and the registration Tells are synchronous and strictly ordered.
|
||||
// What is NOT guaranteed is completion of each actor's PreStart or the
|
||||
// ClusterClient's initial-contact handshake with central: those are
|
||||
// intentionally asynchronous. Gating readiness on the central handshake would
|
||||
// be wrong — a site must come up and stream locally even while central is
|
||||
// briefly unreachable. gRPC readiness therefore guarantees "the site actor
|
||||
// graph exists and can accept subscription streams", not "the cluster
|
||||
// handshake has completed". Streams opened before SetReady are already
|
||||
// rejected by SiteStreamGrpcServer with StatusCode.Unavailable.
|
||||
var grpcServer = _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.Communication.Grpc.SiteStreamGrpcServer>();
|
||||
// Audit Log (#23 M6): hand the site-local SqliteAuditWriter (which
|
||||
// implements ISiteAuditQueue) to the gRPC server so the PullAuditEvents
|
||||
// reconciliation RPC can serve central's pulls. Both the writer and the
|
||||
// gRPC server are singletons — wiring this here keeps the dependency
|
||||
// direction one-way (Host knows both; Communication doesn't reach back
|
||||
// into AuditLog).
|
||||
grpcServer?.SetSiteAuditQueue(siteAuditQueue);
|
||||
grpcServer?.SetReady(_actorSystem!);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,65 @@
|
||||
using Akka.Actor;
|
||||
using Akka.Event;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host.Actors;
|
||||
|
||||
/// <summary>
/// Actor that listens on the actor system's event stream for <see cref="DeadLetter"/>
/// notifications. Every dead letter is counted, reported to the optional health
/// collector and written to the log as a warning. The running total can be read
/// back by sending <see cref="GetDeadLetterCount"/>.
/// </summary>
public class DeadLetterMonitorActor : ReceiveActor
{
    private readonly ISiteHealthCollector? _healthCollector;
    private long _deadLetterCount;

    /// <summary>
    /// Creates the actor and installs its handlers for <see cref="DeadLetter"/> and
    /// <see cref="GetDeadLetterCount"/>.
    /// </summary>
    /// <param name="logger">Receives one warning entry per dead letter.</param>
    /// <param name="healthCollector">
    /// Optional collector whose dead-letter metric is incremented for each dead letter;
    /// when <c>null</c> only the local counter and the log are updated.
    /// </param>
    public DeadLetterMonitorActor(ILogger<DeadLetterMonitorActor> logger, ISiteHealthCollector? healthCollector = null)
    {
        _healthCollector = healthCollector;

        Receive<DeadLetter>(deadLetter => RecordDeadLetter(deadLetter));

        // Reply to the asker with a snapshot of the counter.
        Receive<GetDeadLetterCount>(_ =>
        {
            var reply = new DeadLetterCountResponse(_deadLetterCount);
            Sender.Tell(reply);
        });

        // Count first, then notify the collector, then log — same order for every dead letter.
        void RecordDeadLetter(DeadLetter deadLetter)
        {
            _deadLetterCount += 1;
            _healthCollector?.IncrementDeadLetter();

            var messageType = deadLetter.Message.GetType().Name;
            logger.LogWarning(
                "Dead letter: {MessageType} from {Sender} to {Recipient}",
                messageType,
                deadLetter.Sender,
                deadLetter.Recipient);
        }
    }

    /// <inheritdoc />
    protected override void PreStart() =>
        Context.System.EventStream.Subscribe(Self, typeof(DeadLetter));

    /// <inheritdoc />
    protected override void PostStop() =>
        Context.System.EventStream.Unsubscribe(Self, typeof(DeadLetter));
}
|
||||
|
||||
/// <summary>
/// Query message that asks for the number of dead letters counted so far.
/// The type cannot be constructed from outside; use <see cref="Instance"/>.
/// </summary>
public sealed class GetDeadLetterCount
{
    // Private so that Instance is the only value of this type.
    private GetDeadLetterCount()
    {
    }

    /// <summary>The single shared instance of this message.</summary>
    public static readonly GetDeadLetterCount Instance = new();
}
|
||||
|
||||
/// <summary>
|
||||
/// Response containing the current dead letter count.
|
||||
/// </summary>
|
||||
/// <param name="Count">Number of dead letters counted at the time the request was handled.</param>
public sealed record DeadLetterCountResponse(long Count);
|
||||
@@ -0,0 +1,34 @@
|
||||
using Akka.Actor;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
|
||||
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
using ZB.MOM.WW.ScadaBridge.Host.Actors;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host;
|
||||
|
||||
/// <summary>
/// <see cref="IHealthReportTransport"/> that delivers each <see cref="SiteHealthReport"/>
/// to the actor registered at <c>/user/site-communication</c> in the local actor system
/// (the SiteCommunicationActor, which is responsible for passing it on to central).
/// </summary>
public class AkkaHealthReportTransport : IHealthReportTransport
{
    private readonly AkkaHostedService _akkaService;

    /// <summary>
    /// Creates a transport that looks up the actor system through <paramref name="akkaService"/>
    /// every time a report is sent.
    /// </summary>
    /// <param name="akkaService">Hosted service exposing the running actor system, if any.</param>
    public AkkaHealthReportTransport(AkkaHostedService akkaService) => _akkaService = akkaService;

    /// <inheritdoc />
    public void Send(SiteHealthReport report)
    {
        // While no actor system is available the report is silently dropped.
        // The message is sent without a sender, so no reply is expected.
        _akkaService.ActorSystem?
            .ActorSelection("/user/site-communication")
            .Tell(report, ActorRefs.NoSender);
    }
}
|
||||
@@ -0,0 +1,86 @@
|
||||
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <base href="/" />
    <title>ScadaBridge</title>
    @* Stylesheet order is significant for the cascade; keep it as listed. *@
    <link href="/lib/bootstrap/css/bootstrap.min.css" rel="stylesheet" />
    <link href="/lib/bootstrap-icons/bootstrap-icons.css" rel="stylesheet" />
    <link href="_content/ZB.MOM.WW.ScadaBridge.CentralUI/css/theme.css" rel="stylesheet" />
    <link href="/ZB.MOM.WW.ScadaBridge.Host.styles.css" rel="stylesheet" />
    <link href="_content/ZB.MOM.WW.ScadaBridge.CentralUI/css/site.css" rel="stylesheet" />
    <HeadOutlet @rendermode="InteractiveServer" />
</head>
<body>
    <Routes @rendermode="InteractiveServer" />

    @* Overlay shown while the circuit is down; the script below hides it again. *@
    <div id="reconnect-modal">
        <div class="modal-dialog modal-dialog-centered">
            <div class="modal-content">
                <div class="spinner-border text-primary mb-3" role="status">
                    <span class="visually-hidden">Reconnecting...</span>
                </div>
                <h5>Connection Lost</h5>
                <p class="text-muted mb-0">Attempting to reconnect to the server. Please wait...</p>
            </div>
        </div>
    </div>

    @* autostart is off so the inline script can start Blazor with custom circuit options. *@
    <script src="/_framework/blazor.web.js" autostart="false"></script>
    <script>
        // Failover handling: after a docker redeploy (or any other server-side
        // restart) Blazor can run out of reconnect attempts and leave a stuck
        // "Reconnect failed" overlay on screen. The code below reloads the page
        // in that situation so the user gets a fresh circuit without having to
        // refresh manually.
        Blazor.start({
            circuit: {
                reconnectionOptions: {
                    maxRetries: 8,
                    retryIntervalMilliseconds: 1500
                },
                reconnectionHandler: {
                    onConnectionDown: () => { /* default overlay */ },
                    onConnectionUp: () => {
                        const modal = document.getElementById('reconnect-modal');
                        if (modal) {
                            modal.style.display = 'none';
                        }
                    }
                }
            }
        });

        // Also hide the overlay after an enhanced navigation load, when the
        // Blazor object exposes addEventListener.
        document.addEventListener('DOMContentLoaded', () => {
            if (typeof Blazor !== 'undefined') {
                Blazor.addEventListener?.('enhancedload', () => {
                    const modal = document.getElementById('reconnect-modal');
                    if (modal) {
                        modal.style.display = 'none';
                    }
                });
            }
        });

        // Once Blazor gives up reconnecting, the `components-reconnect-failed`
        // class appears on the reconnect modal element. Observe class changes
        // anywhere in the document and reload as soon as that class is present.
        var mo = new MutationObserver(() => {
            const modal = document.getElementById('reconnect-modal');
            if (!modal) {
                return;
            }
            if (modal.classList.contains('components-reconnect-failed')) {
                window.location.reload();
            }
        });
        mo.observe(document.documentElement, {
            attributes: true,
            subtree: true,
            attributeFilter: ['class']
        });
    </script>
    <script src="/js/treeview-storage.js"></script>
    <script src="_content/ZB.MOM.WW.ScadaBridge.CentralUI/js/nav-state.js"></script>
    <script src="_content/ZB.MOM.WW.ScadaBridge.CentralUI/js/monaco-init.js"></script>
    <script src="_content/ZB.MOM.WW.ScadaBridge.CentralUI/js/audit-grid.js"></script>
    <script src="_content/ZB.MOM.WW.ScadaBridge.CentralUI/js/transport.js"></script>
    <script src="/lib/bootstrap/js/bootstrap.bundle.min.js"></script>
</body>
</html>
|
||||
@@ -0,0 +1,31 @@
|
||||
@* Application router: pages are discovered in this assembly and in the CentralUI assembly. *@
<CascadingAuthenticationState>
    <Router AppAssembly="typeof(Routes).Assembly"
            AdditionalAssemblies="new[] { typeof(ZB.MOM.WW.ScadaBridge.CentralUI.Components.Layout.MainLayout).Assembly }">
        <Found Context="routeData">
            <AuthorizeRouteView RouteData="routeData"
                                DefaultLayout="typeof(ZB.MOM.WW.ScadaBridge.CentralUI.Components.Layout.MainLayout)">
                <NotAuthorized>
                    @* Signed-in users who lack permission see the "not authorized" view;
                       everyone else is sent to the login page. *@
                    @if (context.User.Identity?.IsAuthenticated == true)
                    {
                        <NotAuthorizedView />
                    }
                    else
                    {
                        <RedirectToLogin />
                    }
                </NotAuthorized>
                <Authorizing>
                    <p class="text-muted p-3">Checking authorization...</p>
                </Authorizing>
            </AuthorizeRouteView>
        </Found>
        <NotFound>
            <LayoutView Layout="typeof(ZB.MOM.WW.ScadaBridge.CentralUI.Components.Layout.MainLayout)">
                <div class="container mt-5">
                    <h3>Page Not Found</h3>
                    <p class="text-muted">The requested page does not exist.</p>
                    <a href="/" class="btn btn-outline-primary btn-sm">Return to Dashboard</a>
                </div>
            </LayoutView>
        </NotFound>
    </Router>
</CascadingAuthenticationState>
|
||||
@@ -0,0 +1,12 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host;
|
||||
|
||||
/// <summary>
/// Options object for the <c>ScadaBridge:Database</c> configuration section.
/// Both settings are optional and default to <c>null</c>.
/// </summary>
public class DatabaseOptions
{
    /// <summary>
    /// Connection string of the central configuration SQL Server database.
    /// </summary>
    public string? ConfigurationDb { get; set; }

    /// <summary>
    /// File system path of the directory holding the site-local SQLite database.
    /// </summary>
    public string? SiteDbPath { get; set; }
}
|
||||
@@ -0,0 +1,57 @@
|
||||
using Akka.Cluster;
|
||||
using ZB.MOM.WW.ScadaBridge.Host.Actors;
|
||||
using ZB.MOM.WW.ScadaBridge.InboundAPI;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host.Health;
|
||||
|
||||
/// <summary>
/// InboundAPI-008 / InboundAPI-022: the production <see cref="IActiveNodeGate"/>,
/// answering from the state of the running Akka.NET cluster.
///
/// The inbound API is "Central cluster only (active node)": a standby central node
/// must not run method scripts or <c>Route.To()</c> calls. The gate applies the same
/// leadership test as <see cref="ActiveNodeHealthCheck"/> (this node is
/// <see cref="MemberStatus.Up"/> and is the cluster leader), which lets
/// <see cref="InboundApiEndpointFilter"/> answer HTTP 503 on a standby.
///
/// It is registered only in the Central-role branch of <c>Program.cs</c> and resolved
/// per request from <c>HttpContext.RequestServices</c>. While <c>AkkaHostedService</c>
/// is still starting (<c>ActorSystem == null</c>) or the node has not reached
/// <see cref="MemberStatus.Up"/>, <c>IsActiveNode</c> is <c>false</c> — the same
/// safe default as for a standby.
/// </summary>
public sealed class ActiveNodeGate : IActiveNodeGate
{
    private readonly AkkaHostedService _akkaService;

    /// <summary>Creates a gate that reads cluster state through the given hosted service.</summary>
    /// <param name="akkaService">Hosted service exposing the cluster's <see cref="Akka.Actor.ActorSystem"/>.</param>
    public ActiveNodeGate(AkkaHostedService akkaService) => _akkaService = akkaService;

    /// <summary>
    /// <c>true</c> only if this node is <see cref="MemberStatus.Up"/> in the cluster and is
    /// also the current cluster leader. Every other situation — no actor system yet,
    /// node still joining, node is a standby — yields <c>false</c>.
    /// </summary>
    public bool IsActiveNode
    {
        get
        {
            if (_akkaService.ActorSystem is not { } actorSystem)
            {
                return false;
            }

            var cluster = Cluster.Get(actorSystem);
            var selfMember = cluster.SelfMember;

            // Membership is checked before leadership; the leader is only read for an Up node.
            return selfMember.Status == MemberStatus.Up
                && cluster.State.Leader is { } leaderAddress
                && leaderAddress == selfMember.Address;
        }
    }
}
|
||||
@@ -0,0 +1,45 @@
|
||||
using Akka.Cluster;
|
||||
using Microsoft.Extensions.Diagnostics.HealthChecks;
|
||||
using ZB.MOM.WW.ScadaBridge.Host.Actors;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host.Health;
|
||||
|
||||
/// <summary>
/// Health check that is healthy only on the active node, i.e. the node that is the
/// leader of the Akka.NET cluster. Traefik uses the result to send traffic to the
/// active node.
/// </summary>
public class ActiveNodeHealthCheck : IHealthCheck
{
    private readonly AkkaHostedService _akkaService;

    /// <summary>Creates the check on top of the given Akka hosted service.</summary>
    /// <param name="akkaService">Hosted service giving access to the actor system and its cluster state.</param>
    public ActiveNodeHealthCheck(AkkaHostedService akkaService) => _akkaService = akkaService;

    /// <summary>
    /// Reports healthy when this node is <see cref="MemberStatus.Up"/> and is the cluster
    /// leader; reports unhealthy in every other case.
    /// </summary>
    /// <param name="context">Health check context providing registration details (not used).</param>
    /// <param name="cancellationToken">Cancellation token (not used; the check completes synchronously).</param>
    /// <returns>An already-completed task holding the result.</returns>
    public Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
        => Task.FromResult(Evaluate());

    // Walks the three conditions in order: actor system present, node Up, node is leader.
    private HealthCheckResult Evaluate()
    {
        if (_akkaService.ActorSystem is not { } actorSystem)
        {
            return HealthCheckResult.Unhealthy("ActorSystem not yet available.");
        }

        var cluster = Cluster.Get(actorSystem);
        var selfMember = cluster.SelfMember;
        if (selfMember.Status != MemberStatus.Up)
        {
            return HealthCheckResult.Unhealthy($"Node not Up (status: {selfMember.Status}).");
        }

        var leaderAddress = cluster.State.Leader;
        var isLeader = leaderAddress != null && leaderAddress == selfMember.Address;
        return isLeader
            ? HealthCheckResult.Healthy("Active node (cluster leader).")
            : HealthCheckResult.Unhealthy("Standby node (not cluster leader).");
    }
}
|
||||
@@ -0,0 +1,52 @@
|
||||
using Akka.Cluster;
|
||||
using Microsoft.Extensions.Diagnostics.HealthChecks;
|
||||
using ZB.MOM.WW.ScadaBridge.Host.Actors;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host.Health;
|
||||
|
||||
/// <summary>
/// Health check based on this node's own membership status in the Akka.NET cluster:
/// healthy for Up or Joining, degraded for Leaving or Exiting (and while no actor
/// system exists yet), unhealthy for any other status.
/// </summary>
public class AkkaClusterHealthCheck : IHealthCheck
{
    private readonly AkkaHostedService _akkaService;

    /// <summary>
    /// Creates the check on top of the given Akka hosted service.
    /// </summary>
    /// <param name="akkaService">Hosted service giving access to the Akka actor system.</param>
    public AkkaClusterHealthCheck(AkkaHostedService akkaService) => _akkaService = akkaService;

    /// <summary>
    /// Maps the self-member status of this node to a health result.
    /// </summary>
    /// <param name="context">Health check context (not used).</param>
    /// <param name="cancellationToken">Cancellation token (not used; the check completes synchronously).</param>
    /// <returns>An already-completed task holding the result.</returns>
    public Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        if (_akkaService.ActorSystem is not { } actorSystem)
        {
            return Task.FromResult(HealthCheckResult.Degraded("ActorSystem not yet available."));
        }

        var memberStatus = Cluster.Get(actorSystem).SelfMember.Status;

        // Every outcome carries the same description; only the severity differs.
        var description = $"Akka cluster member status: {memberStatus}";

        switch (memberStatus)
        {
            case MemberStatus.Up:
            case MemberStatus.Joining:
                return Task.FromResult(HealthCheckResult.Healthy(description));

            case MemberStatus.Leaving:
            case MemberStatus.Exiting:
                return Task.FromResult(HealthCheckResult.Degraded(description));

            default:
                return Task.FromResult(HealthCheckResult.Unhealthy(description));
        }
    }
}
|
||||
@@ -0,0 +1,82 @@
|
||||
using Akka.Actor;
|
||||
using Akka.Cluster;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
|
||||
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
using ZB.MOM.WW.ScadaBridge.Host.Actors;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host.Health;
|
||||
|
||||
/// <summary>
/// <see cref="IClusterNodeProvider"/> backed by Akka.NET cluster membership. It reports
/// whether the local node is the cluster leader and lists the cluster members that
/// carry the configured site role, for use in health reporting.
/// </summary>
public class AkkaClusterNodeProvider : IClusterNodeProvider
{
    private readonly AkkaHostedService _akkaService;
    private readonly string _siteRole;

    /// <summary>
    /// Initializes a new <see cref="AkkaClusterNodeProvider"/>.
    /// </summary>
    /// <param name="akkaService">The Akka hosted service providing access to the actor system.</param>
    /// <param name="siteRole">The Akka cluster role a member must have to be included in the node list.</param>
    public AkkaClusterNodeProvider(AkkaHostedService akkaService, string siteRole)
    {
        _akkaService = akkaService;
        _siteRole = siteRole;
    }

    /// <inheritdoc />
    /// <remarks>
    /// <c>true</c> only when the actor system exists, this node's member status is
    /// <see cref="MemberStatus.Up"/>, and the cluster leader address equals this node's address.
    /// </remarks>
    public bool SelfIsPrimary
    {
        get
        {
            var system = _akkaService.ActorSystem;
            if (system == null)
            {
                return false;
            }

            var cluster = Cluster.Get(system);
            if (cluster.SelfMember.Status != MemberStatus.Up)
            {
                return false;
            }

            var leader = cluster.State.Leader;
            return leader != null && leader.Equals(cluster.SelfAddress);
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// Returns an empty list while no actor system is available. A node is reported
    /// online only when its status is <see cref="MemberStatus.Up"/> and it is not in
    /// the cluster's unreachable set. The node whose address matches the cluster leader
    /// is labelled "Primary"; all others are labelled "Standby".
    /// </remarks>
    public IReadOnlyList<NodeStatus> GetClusterNodes()
    {
        var system = _akkaService.ActorSystem;
        if (system == null)
        {
            return [];
        }

        // Read the cluster state once so leader, members and unreachable all come
        // from the same snapshot.
        var state = Cluster.Get(system).State;
        var leader = state.Leader;
        var unreachableMembers = state.Unreachable;

        var nodes = new List<NodeStatus>();
        var reportedHostnames = new HashSet<string>();

        foreach (var member in state.Members)
        {
            if (!member.HasRole(_siteRole))
            {
                continue;
            }

            var hostname = HostnameOf(member);

            // An unreachable member stays in Members with its last status (typically Up),
            // so the status alone is not enough to call a node online.
            var isOnline = member.Status == MemberStatus.Up && !unreachableMembers.Contains(member);
            var role = member.Address.Equals(leader) ? "Primary" : "Standby";

            nodes.Add(new NodeStatus(hostname, isOnline, role));
            reportedHostnames.Add(hostname);
        }

        // Safety net: report any unreachable member that was not already listed above
        // (matched by hostname) as an offline standby.
        foreach (var unreachable in unreachableMembers)
        {
            if (!unreachable.HasRole(_siteRole))
            {
                continue;
            }

            var hostname = HostnameOf(unreachable);
            if (!reportedHostnames.Add(hostname))
            {
                continue;
            }

            nodes.Add(new NodeStatus(hostname, false, "Standby"));
        }

        return nodes;

        // Host part of the member address, or the full address text when there is no host.
        static string HostnameOf(Member member) =>
            member.Address.Host ?? member.Address.ToString();
    }
}
|
||||
@@ -0,0 +1,43 @@
|
||||
using Microsoft.Extensions.Diagnostics.HealthChecks;
|
||||
using ZB.MOM.WW.ScadaBridge.ConfigurationDatabase;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host.Health;
|
||||
|
||||
/// <summary>
/// Health check that verifies database connectivity for Central nodes.
/// </summary>
public class DatabaseHealthCheck : IHealthCheck
{
    private readonly ScadaBridgeDbContext _dbContext;

    /// <summary>
    /// Initializes a new <see cref="DatabaseHealthCheck"/>.
    /// </summary>
    /// <param name="dbContext">The EF Core database context used to test connectivity.</param>
    public DatabaseHealthCheck(ScadaBridgeDbContext dbContext) => _dbContext = dbContext;

    /// <summary>
    /// Probes the database via <c>CanConnectAsync</c> and maps the outcome to a health result.
    /// </summary>
    /// <param name="context">Health check context (not consulted by this check).</param>
    /// <param name="cancellationToken">Cancellation token forwarded to the connectivity probe.</param>
    /// <returns>
    /// Healthy when the probe reports a usable connection; Unhealthy when it reports
    /// failure or throws (the exception is attached to the result).
    /// </returns>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var reachable = await _dbContext.Database.CanConnectAsync(cancellationToken);
            if (reachable)
                return HealthCheckResult.Healthy("Database connection is available.");

            return HealthCheckResult.Unhealthy("Database connection failed.");
        }
        catch (Exception ex)
        {
            // Any probe exception is surfaced as Unhealthy rather than propagated.
            return HealthCheckResult.Unhealthy("Database connection failed.", ex);
        }
    }
}
|
||||
@@ -0,0 +1,126 @@
|
||||
using Serilog;
|
||||
using Serilog.Events;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host;
|
||||
|
||||
/// <summary>
/// Builds the Serilog <see cref="LoggerConfiguration"/> for the Host process.
///
/// REQ-HOST-8 / Host-011: the configured minimum level comes from
/// <c>ScadaBridge:Logging:MinimumLevel</c> (bound to <see cref="LoggingOptions"/>) so an
/// operator editing that key changes the effective log level.
///
/// REQ-HOST-8 / Host-014: the console and file sinks are read from the standard
/// <c>Serilog</c> configuration section via <c>ReadFrom.Configuration</c> — the sink
/// set, console output template, file path and rolling interval are all
/// configuration-driven (defined in <c>appsettings.json</c>), not hard-coded. The
/// explicit <c>MinimumLevel.Is</c> below pins the floor from <see cref="LoggingOptions"/>.
///
/// Host-020: <c>ScadaBridge:Logging:MinimumLevel</c> is the single source of truth
/// for the floor — the explicit <c>MinimumLevel.Is</c> call deliberately runs
/// AFTER <c>ReadFrom.Configuration</c> so a <c>Serilog:MinimumLevel</c> entry in
/// configuration is overridden. To make that precedence visible (so an operator
/// who sets <c>Serilog:MinimumLevel</c> does not wonder why the change had no
/// effect), <c>Build</c> writes a one-shot warning to
/// <see cref="Console.Error"/> when both keys are present. Pick one path —
/// editing <c>Serilog:MinimumLevel</c> alone has no effect.
/// </summary>
public static class LoggerConfigurationFactory
{
    /// <summary>Builds a <see cref="LoggerConfiguration"/> enriched with node-identity properties and a configured minimum level.</summary>
    /// <param name="configuration">Application configuration supplying the Serilog section and logging options.</param>
    /// <param name="nodeRole">Role label (e.g., <c>central-a</c>) added as a log enrichment property.</param>
    /// <param name="siteId">Site identifier added as a log enrichment property.</param>
    /// <param name="nodeHostname">Hostname added as a log enrichment property.</param>
    /// <returns>The configured <see cref="LoggerConfiguration"/>.</returns>
    public static LoggerConfiguration Build(
        IConfiguration configuration,
        string nodeRole,
        string siteId,
        string nodeHostname)
        => Build(configuration, nodeRole, siteId, nodeHostname, Console.Error);

    /// <summary>
    /// Test-visible overload of <see cref="Build(IConfiguration, string, string, string)"/>
    /// that routes the Host-020 precedence warning through a caller-supplied
    /// writer so unit tests can capture it. Production calls the four-arg
    /// overload which uses <see cref="Console.Error"/>.
    /// </summary>
    /// <param name="configuration">Application configuration supplying the Serilog section and logging options.</param>
    /// <param name="nodeRole">Role label added as a log enrichment property.</param>
    /// <param name="siteId">Site identifier added as a log enrichment property.</param>
    /// <param name="nodeHostname">Hostname added as a log enrichment property.</param>
    /// <param name="warningWriter">Writer that receives the one-shot Host-020 override-warning when both keys are present.</param>
    /// <returns>The configured <see cref="LoggerConfiguration"/>.</returns>
    internal static LoggerConfiguration Build(
        IConfiguration configuration,
        string nodeRole,
        string siteId,
        string nodeHostname,
        TextWriter warningWriter)
    {
        var loggingOptions = new LoggingOptions();
        configuration.GetSection("ScadaBridge:Logging").Bind(loggingOptions);

        var minimumLevel = ParseLevel(loggingOptions.MinimumLevel, warningWriter);

        // Host-020: warn once if the operator also set a Serilog:MinimumLevel —
        // they almost certainly expected it to take effect, but the explicit
        // MinimumLevel.Is call below silently overrides it. The warning is
        // emitted only when the conflicting key is actually present (a bare
        // "Default" value is what ReadFrom.Configuration reads); a missing /
        // empty Serilog:MinimumLevel section is silent.
        var serilogMinimumLevel = configuration["Serilog:MinimumLevel"]
            ?? configuration["Serilog:MinimumLevel:Default"];
        if (!string.IsNullOrWhiteSpace(serilogMinimumLevel))
        {
            warningWriter.WriteLine(
                $"warning: Serilog:MinimumLevel ('{serilogMinimumLevel}') is being overridden by " +
                $"ScadaBridge:Logging:MinimumLevel ('{loggingOptions.MinimumLevel ?? "Information (default)"}'). " +
                "ScadaBridge:Logging:MinimumLevel is the documented source of truth for the floor (Host-011); " +
                "remove the Serilog:MinimumLevel entry to silence this warning.");
        }

        // Order matters: MinimumLevel.Is runs after ReadFrom.Configuration so it wins.
        return new LoggerConfiguration()
            .ReadFrom.Configuration(configuration)
            .MinimumLevel.Is(minimumLevel)
            .Enrich.WithProperty("SiteId", siteId)
            .Enrich.WithProperty("NodeHostname", nodeHostname)
            .Enrich.WithProperty("NodeRole", nodeRole);
    }

    /// <summary>
    /// Parses a Serilog <see cref="LogEventLevel"/> name, falling back to
    /// <see cref="LogEventLevel.Information"/> for null/blank/unrecognised values.
    ///
    /// Host-022: when an operator sets <c>ScadaBridge:Logging:MinimumLevel</c> to a
    /// value that doesn't parse (e.g. the typo "Informaiton"), the helper must NOT
    /// throw — startup has to succeed so the rest of the system can come up — but
    /// it MUST make the silent fallback visible. The logger is not yet built at
    /// this point, so the warning is written directly to <see cref="Console.Error"/>;
    /// non-null/non-blank values that fail to parse are reported once, naming the
    /// offending value and the fallback.
    /// Null/blank values are treated as "unset" and silently default — only
    /// explicit-but-invalid values trigger the warning.
    /// </summary>
    /// <param name="level">Configured level string, possibly null/blank/invalid.</param>
    /// <returns>The parsed level, or <see cref="LogEventLevel.Information"/> as the fallback.</returns>
    internal static LogEventLevel ParseLevel(string? level)
        => ParseLevel(level, Console.Error);

    /// <summary>
    /// Test-visible overload of <see cref="ParseLevel(string?)"/> that routes the
    /// one-shot warning through a caller-supplied writer (<see cref="Console.Error"/>
    /// in production) so unit tests can capture the warning output.
    /// </summary>
    /// <param name="level">Configured level string, possibly null/blank/invalid.</param>
    /// <param name="warningWriter">Writer that receives a single warning line if the value is non-blank but unparseable.</param>
    /// <returns>The parsed level, or <see cref="LogEventLevel.Information"/> as the fallback.</returns>
    internal static LogEventLevel ParseLevel(string? level, TextWriter warningWriter)
    {
        // Enum.TryParse also succeeds for numeric strings that name no member
        // (e.g. "42"), which would yield an undefined level with no warning —
        // so a parsed value is accepted only when it is a declared member.
        if (Enum.TryParse<LogEventLevel>(level, ignoreCase: true, out var parsed)
            && Enum.IsDefined(parsed))
            return parsed;

        if (!string.IsNullOrWhiteSpace(level))
            warningWriter.WriteLine(
                $"warning: ScadaBridge:Logging:MinimumLevel value '{level}' is not a recognised Serilog LogEventLevel; falling back to Information.");

        return LogEventLevel.Information;
    }
}
|
||||
@@ -0,0 +1,7 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host;
|
||||
|
||||
/// <summary>
/// Options object holding the host's configured minimum log level.
/// </summary>
public class LoggingOptions
{
    /// <summary>
    /// Gets or sets the minimum log level (e.g. "Trace", "Debug", "Information", "Warning", "Error").
    /// Defaults to "Information".
    /// </summary>
    public string MinimumLevel { get; set; } = "Information";
}
|
||||
@@ -0,0 +1,25 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host;
|
||||
|
||||
/// <summary>
/// <see cref="INodeIdentityProvider"/> backed by <see cref="NodeOptions.NodeName"/>.
/// A null, empty or whitespace-only configured name is exposed as <c>null</c>;
/// any other value is exposed with surrounding whitespace trimmed.
/// </summary>
internal sealed class NodeIdentityProvider : INodeIdentityProvider
{
    /// <summary>
    /// Initializes a new <see cref="NodeIdentityProvider"/> from the given node options.
    /// </summary>
    /// <param name="nodeOptions">Node options whose <see cref="NodeOptions.NodeName"/> is normalised.</param>
    public NodeIdentityProvider(IOptions<NodeOptions> nodeOptions)
    {
        var rawName = nodeOptions.Value.NodeName;

        // The value is captured once at construction; later option changes are not observed.
        if (string.IsNullOrWhiteSpace(rawName))
        {
            NodeName = null;
            return;
        }

        NodeName = rawName.Trim();
    }

    /// <inheritdoc />
    public string? NodeName { get; }
}
|
||||
@@ -0,0 +1,23 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host;
|
||||
|
||||
/// <summary>
/// Options object describing this node: role, advertised hostname, node name,
/// site id and network ports.
/// </summary>
public class NodeOptions
{
    /// <summary>Gets or sets the node role (e.g. Central or Site).</summary>
    public string Role { get; set; } = string.Empty;

    /// <summary>Gets or sets the hostname or IP address this node advertises to the Akka cluster.</summary>
    public string NodeHostname { get; set; } = string.Empty;

    /// <summary>
    /// Gets or sets the operator-configured semantic node name used to stamp the
    /// SourceNode column on audit rows. Conventional values are
    /// <c>node-a</c>/<c>node-b</c> on site nodes and <c>central-a</c>/<c>central-b</c>
    /// on central nodes, but the value is a free-form label — no validation is enforced.
    /// </summary>
    public string NodeName { get; set; } = string.Empty;

    /// <summary>Gets or sets the site identifier for site nodes; null for central nodes.</summary>
    public string? SiteId { get; set; }

    /// <summary>Gets or sets the Akka.NET remoting port for cluster communication (default 8081).</summary>
    public int RemotingPort { get; set; } = 8081;

    /// <summary>Gets or sets the gRPC port for the site stream server (default 8083).</summary>
    public int GrpcPort { get; set; } = 8083;
}
|
||||
@@ -0,0 +1,326 @@
|
||||
using HealthChecks.UI.Client;
|
||||
using Microsoft.AspNetCore.Diagnostics.HealthChecks;
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog;
|
||||
using ZB.MOM.WW.ScadaBridge.CentralUI;
|
||||
using ZB.MOM.WW.ScadaBridge.ClusterInfrastructure;
|
||||
using ZB.MOM.WW.ScadaBridge.Communication;
|
||||
using ZB.MOM.WW.ScadaBridge.ConfigurationDatabase;
|
||||
using ZB.MOM.WW.ScadaBridge.DeploymentManager;
|
||||
using ZB.MOM.WW.ScadaBridge.ExternalSystemGateway;
|
||||
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
using ZB.MOM.WW.ScadaBridge.Host;
|
||||
using ZB.MOM.WW.ScadaBridge.Host.Actors;
|
||||
using ZB.MOM.WW.ScadaBridge.Host.Health;
|
||||
using ZB.MOM.WW.ScadaBridge.InboundAPI;
|
||||
using ZB.MOM.WW.ScadaBridge.InboundAPI.Middleware;
|
||||
using ZB.MOM.WW.ScadaBridge.ManagementService;
|
||||
using ZB.MOM.WW.ScadaBridge.NotificationOutbox;
|
||||
using ZB.MOM.WW.ScadaBridge.NotificationService;
|
||||
using ZB.MOM.WW.ScadaBridge.Security;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteCallAudit;
|
||||
using ZB.MOM.WW.ScadaBridge.TemplateEngine;
|
||||
using ZB.MOM.WW.ScadaBridge.Transport;
|
||||
using Serilog;
|
||||
|
||||
// Role-specific settings file selection: SCADALINK_CONFIG wins, then
// DOTNET_ENVIRONMENT, then "Production". DOTNET_ENVIRONMENT/ASPNETCORE_ENVIRONMENT
// stay as "Development" for dev tooling (static assets, EF migrations, etc.), which
// is why the role (Central or Site) is carried by a separate variable.
var scadabridgeConfig = Environment.GetEnvironmentVariable("SCADALINK_CONFIG");
scadabridgeConfig ??= Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT");
scadabridgeConfig ??= "Production";

// Later sources override earlier ones: base JSON, role JSON, environment, command line.
var configuration = new ConfigurationBuilder()
    .AddJsonFile("appsettings.json", optional: false)
    .AddJsonFile($"appsettings.{scadabridgeConfig}.json", optional: true)
    .AddEnvironmentVariables()
    .AddCommandLine(args)
    .Build();

// WP-11: Full startup validation — fail fast before any DI or actor system setup
StartupValidator.Validate(configuration);

// Node identity values used for Serilog enrichment (and role dispatch further down).
var nodeRole = configuration["ScadaBridge:Node:Role"]!;
var siteId = configuration["ScadaBridge:Node:SiteId"] ?? "central";
var nodeHostname = configuration["ScadaBridge:Node:NodeHostname"] ?? "unknown";

// WP-14: Serilog structured logging.
// Host-011: minimum level is driven by ScadaBridge:Logging:MinimumLevel (LoggingOptions).
// Host-014: console and file sinks are defined in the `Serilog` configuration
// section (appsettings.json) and applied via ReadFrom.Configuration inside the
// factory — the sink set, output template, file path and rolling interval are all
// configuration-driven per REQ-HOST-8, not hard-coded here.
var loggerConfiguration = ZB.MOM.WW.ScadaBridge.Host.LoggerConfigurationFactory
    .Build(configuration, nodeRole, siteId, nodeHostname);
Log.Logger = loggerConfiguration.CreateLogger();
|
||||
|
||||
// Role dispatch: builds and runs either the Central web host or the Site gRPC host,
// depending on nodeRole. Any exception is logged as fatal and rethrown; the logger
// is always flushed on the way out.
try
{
    Log.Information("Starting ScadaBridge host as {Role} on {Hostname}", nodeRole, nodeHostname);

    if (nodeRole.Equals("Central", StringComparison.OrdinalIgnoreCase))
    {
        var builder = WebApplication.CreateBuilder(args);
        builder.Configuration.AddConfiguration(configuration);

        // WP-14: Serilog
        builder.Host.UseSerilog();

        // WP-17: Windows Service support (no-op when not running as a Windows Service)
        builder.Host.UseWindowsService();

        // Shared components
        builder.Services.AddClusterInfrastructure();
        builder.Services.AddCommunication();
        builder.Services.AddHealthMonitoring();
        builder.Services.AddCentralHealthAggregation();
        builder.Services.AddExternalSystemGateway();
        builder.Services.AddNotificationService();

        // Central-only components
        // Notification Outbox: central owns SMTP delivery; the Email adapter reuses the
        // AddNotificationService() SMTP machinery above. AddNotificationOutbox binds
        // NotificationOutboxOptions via BindConfiguration, so no explicit Configure is needed.
        builder.Services.AddNotificationOutbox();
        // Transport (#24) — central-only bundle export/import pipeline. Binds
        // TransportOptions from ScadaBridge:Transport via BindConfiguration; no
        // explicit Configure needed.
        builder.Services.AddTransport();
        // Audit Log (#23) — central node owns the AuditLogIngestActor singleton +
        // IAuditLogRepository. The site writer chain is still registered (lazy
        // singletons) but is never resolved on a central node.
        builder.Services.AddAuditLog(builder.Configuration);
        // #23 M6-T5 Bundle D — central-only hosted service that rolls
        // pf_AuditLog_Month forward monthly. Depends on IPartitionMaintenance
        // (registered below by AddConfigurationDatabase).
        builder.Services.AddAuditLogCentralMaintenance(builder.Configuration);
        // Site Call Audit (#22) — central node owns the SiteCallAuditActor
        // singleton (M3 Bundle F). The extension itself currently registers
        // nothing — actor Props are constructed inline in AkkaHostedService —
        // but the call is here for symmetry with the other audit composition
        // roots so future per-actor DI lands without touching Program.cs.
        builder.Services.AddSiteCallAudit();
        builder.Services.AddTemplateEngine();
        builder.Services.AddDeploymentManager();
        builder.Services.AddSecurity();
        builder.Services.AddCentralUI();
        builder.Services.AddInboundAPI();
        builder.Services.AddManagementService();

        // The configuration database connection string is mandatory on Central.
        var configDbConnectionString = configuration["ScadaBridge:Database:ConfigurationDb"]
            ?? throw new InvalidOperationException("ScadaBridge:Database:ConfigurationDb connection string is required for Central role.");
        builder.Services.AddConfigurationDatabase(configDbConnectionString);

        // WP-12: Health checks for readiness gating
        builder.Services.AddHealthChecks()
            .AddCheck<DatabaseHealthCheck>("database")
            .AddCheck<AkkaClusterHealthCheck>("akka-cluster")
            .AddCheck<ActiveNodeHealthCheck>("active-node");

        // WP-13: Akka.NET bootstrap via hosted service
        // Registered as a singleton first so other services can resolve the same
        // instance that the host runs as its hosted service.
        builder.Services.AddSingleton<AkkaHostedService>();
        builder.Services.AddHostedService(sp => sp.GetRequiredService<AkkaHostedService>());

        // InboundAPI-022: register the production IActiveNodeGate implementation so
        // standby-node gating is actually enforced (the InboundApiEndpointFilter
        // consults IActiveNodeGate and defaults to "allow" when none is registered,
        // which leaves the design's "central cluster only (active node)" guarantee
        // unenforced in deployed binaries). The gate is backed by the same Akka
        // cluster-leadership check as ActiveNodeHealthCheck above, so the inbound
        // API and the /health/active endpoint Traefik routes against agree on
        // which node is active.
        builder.Services.AddSingleton<ZB.MOM.WW.ScadaBridge.InboundAPI.IActiveNodeGate, ActiveNodeGate>();

        // Cluster node status provider scoped to the Central role — feeds the
        // CentralHealthReportLoop so the central cluster appears on /monitoring/health.
        builder.Services.AddSingleton<IClusterNodeProvider>(sp =>
        {
            var akkaService = sp.GetRequiredService<AkkaHostedService>();
            return new AkkaClusterNodeProvider(akkaService, "Central");
        });

        // Options binding
        SiteServiceRegistration.BindSharedOptions(builder.Services, builder.Configuration);
        builder.Services.Configure<SecurityOptions>(builder.Configuration.GetSection("ScadaBridge:Security"));
        builder.Services.Configure<InboundApiOptions>(builder.Configuration.GetSection("ScadaBridge:InboundApi"));
        builder.Services.Configure<DeploymentManagerOptions>(
            builder.Configuration.GetSection(ZB.MOM.WW.ScadaBridge.DeploymentManager.ServiceCollectionExtensions.OptionsSection));

        var app = builder.Build();

        // Apply or validate database migrations (skip when running in test harness)
        if (!string.Equals(configuration["ScadaBridge:Database:SkipMigrations"], "true", StringComparison.OrdinalIgnoreCase))
        {
            // Development is detected from the host environment OR the raw
            // ASPNETCORE_ENVIRONMENT variable.
            var isDevelopment = app.Environment.IsDevelopment()
                || string.Equals(Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT"), "Development", StringComparison.OrdinalIgnoreCase);
            var migrationLogger = app.Services
                .GetRequiredService<ILoggerFactory>()
                .CreateLogger(typeof(MigrationHelper).FullName!);

            // Host-010: tolerate a database that is briefly unreachable at boot
            // (e.g. app and DB containers starting together) with a bounded
            // exponential backoff before failing fatally.
            // Host-015: only connection-class (transient) faults are retried — a
            // schema-version mismatch is permanent and must fail fast on attempt 1.
            // Host-019: thread the host's ApplicationStopping token into both the
            // migration call itself and the inter-attempt Task.Delay so a SIGTERM
            // during the bounded-retry window (~2 min worst-case) tears down
            // cleanly instead of being ignored until the loop exhausts.
            await StartupRetry.ExecuteWithRetryAsync(
                "database-migration",
                async ct =>
                {
                    // A fresh DI scope (and DbContext) is created for every attempt.
                    using var scope = app.Services.CreateScope();
                    var dbContext = scope.ServiceProvider.GetRequiredService<ScadaBridgeDbContext>();
                    await MigrationHelper.ApplyOrValidateMigrationsAsync(dbContext, isDevelopment, migrationLogger, ct);
                },
                maxAttempts: 8,
                initialDelay: TimeSpan.FromSeconds(2),
                migrationLogger,
                isTransient: StartupRetry.IsTransientDatabaseFault,
                cancellationToken: app.Lifetime.ApplicationStopping);
        }

        // Middleware pipeline
        app.UseWebSockets();
        app.UseRouting();
        app.UseAuthentication();
        app.UseAuthorization();
        app.UseAntiforgery();

        // Audit Log #23 (M4 Bundle D, T8): emit one InboundRequest/InboundAuthFailure
        // audit row per call into the inbound API. Placed AFTER UseAuthentication/
        // UseAuthorization so any HttpContext.User the framework populates is in
        // place, and scoped to the /api/ prefix so it never observes the Central UI,
        // Management API, SignalR hubs, or health endpoints. The endpoint handler
        // is responsible for stashing the resolved API key name on
        // HttpContext.Items (see AuditWriteMiddleware.AuditActorItemKey) AFTER its
        // in-handler API key validation succeeds.
        // InboundAPI-025: scope the audit middleware to the inbound API method
        // route (/api/{methodName}) and explicitly exclude the management/audit
        // sub-trees that share the /api prefix. Without these exclusions the
        // middleware would emit a spurious ApiInbound audit row for every
        // /api/audit/query and /api/audit/export call (and would treat audit-log
        // reads as inbound script invocations — recursive write-on-read). The
        // POST-only filter rules out the GET routes on /api/audit, /api/centralui,
        // /api/script-analysis even if a future route is added under those
        // prefixes with the same verb; the explicit prefix excludes still belt-
        // and-brace POST-y additions there.
        app.UseWhen(
            ctx => ctx.Request.Path.StartsWithSegments("/api")
                && !ctx.Request.Path.StartsWithSegments("/api/audit")
                && !ctx.Request.Path.StartsWithSegments("/api/centralui")
                && !ctx.Request.Path.StartsWithSegments("/api/script-analysis")
                && !ctx.Request.Path.StartsWithSegments("/api/management")
                && HttpMethods.IsPost(ctx.Request.Method),
            branch => branch.UseAuditWriteMiddleware());

        // WP-12: Map readiness endpoint — returns 503 until ready, 200 when ready.
        // REQ-HOST-4a defines readiness as cluster membership + DB connectivity,
        // explicitly NOT cluster leadership. The leader-only "active-node" check is
        // excluded here so a fully operational standby central node reports ready;
        // leadership is reported separately on /health/active.
        app.MapHealthChecks("/health/ready", new HealthCheckOptions
        {
            Predicate = check => check.Name != "active-node",
            ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
        });

        // Active node endpoint — returns 200 only on the cluster leader; used by Traefik for routing
        app.MapHealthChecks("/health/active", new HealthCheckOptions
        {
            Predicate = check => check.Name == "active-node",
            ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
        });

        app.MapStaticAssets();
        app.MapCentralUI<ZB.MOM.WW.ScadaBridge.Host.Components.App>();
        app.MapInboundAPI();
        app.MapManagementAPI();
        // Audit Log #23 (M8): CLI-facing /api/audit/{query,export} routes. Same
        // Basic-Auth + LDAP mechanism as /management; gated on the OperationalAudit
        // / AuditExport role sets.
        app.MapAuditAPI();
        app.MapHub<ZB.MOM.WW.ScadaBridge.ManagementService.DebugStreamHub>("/hubs/debug-stream");

        // Compile and register all Inbound API method scripts at startup
        using (var scope = app.Services.CreateScope())
        {
            // The repository comes from the scope; the executor is resolved from the root provider.
            var apiRepo = scope.ServiceProvider.GetRequiredService<ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories.IInboundApiRepository>();
            var executor = app.Services.GetRequiredService<ZB.MOM.WW.ScadaBridge.InboundAPI.InboundScriptExecutor>();
            var methods = await apiRepo.GetAllApiMethodsAsync();
            foreach (var method in methods)
            {
                executor.CompileAndRegister(method);
            }
        }

        await app.RunAsync();
    }
    else if (nodeRole.Equals("Site", StringComparison.OrdinalIgnoreCase))
    {
        var builder = WebApplication.CreateBuilder(args);
        builder.Configuration.AddConfiguration(configuration);

        // WP-14: Serilog
        builder.Host.UseSerilog();

        // WP-17: Windows Service support (no-op when not running as a Windows Service)
        builder.Host.UseWindowsService();

        // Read GrpcPort from config (NodeOptions already has default 8083)
        var grpcPort = configuration.GetValue<int>("ScadaBridge:Node:GrpcPort", 8083);

        // Configure Kestrel for HTTP/2 only on the gRPC port
        builder.WebHost.ConfigureKestrel(options =>
        {
            options.ListenAnyIP(grpcPort, listenOptions =>
            {
                listenOptions.Protocols = Microsoft.AspNetCore.Server.Kestrel.Core.HttpProtocols.Http2;
            });
        });

        // gRPC server registration
        builder.Services.AddGrpc();
        builder.Services.AddSingleton<ZB.MOM.WW.ScadaBridge.Communication.Grpc.SiteStreamGrpcServer>();

        // Existing site service registrations
        SiteServiceRegistration.Configure(builder.Services, builder.Configuration);

        var app = builder.Build();

        // Map gRPC service — resolves the singleton SiteStreamGrpcServer from DI
        app.MapGrpcService<ZB.MOM.WW.ScadaBridge.Communication.Grpc.SiteStreamGrpcServer>();

        // Host-017 / REQ-HOST-7: site-shutdown ordering. ApplicationStopping
        // fires BEFORE IHostedService.StopAsync runs, so the gRPC server
        // refuses new streams (Unavailable) and cancels every active stream
        // here — clients observe a clean Cancelled and reconnect — and only
        // THEN does AkkaHostedService run CoordinatedShutdown and tear down
        // actors. Without this hand-off, in-flight streams go silent and only
        // time out via gRPC keepalive (~25 s), violating the documented
        // four-step sequence.
        var siteLifetime = app.Services.GetRequiredService<Microsoft.Extensions.Hosting.IHostApplicationLifetime>();
        var siteGrpcServer = app.Services.GetRequiredService<ZB.MOM.WW.ScadaBridge.Communication.Grpc.SiteStreamGrpcServer>();
        siteLifetime.ApplicationStopping.Register(() => siteGrpcServer.CancelAllStreams());

        await app.RunAsync();
    }
    else
    {
        // Any role other than Central/Site is a configuration error.
        throw new InvalidOperationException($"Unknown role: {nodeRole}. Must be 'Central' or 'Site'.");
    }
}
catch (Exception ex)
{
    // Log, then rethrow with the original stack trace so the process still fails.
    Log.Fatal(ex, "ScadaBridge host terminated unexpectedly");
    throw;
}
finally
{
    // Flush buffered log events on both normal exit and failure.
    await Log.CloseAndFlushAsync();
}
|
||||
|
||||
/// <summary>
/// Public partial declaration of the auto-generated Program class so that test
/// infrastructure (e.g. WebApplicationFactory) can reference it.
/// </summary>
public partial class Program
{
}
|
||||
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"profiles": {
|
||||
"ScadaBridge Central": {
|
||||
"commandName": "Project",
|
||||
"dotnetRunMessages": true,
|
||||
"launchBrowser": true,
|
||||
"launchUrl": "",
|
||||
"applicationUrl": "https://localhost:5001;http://localhost:5000",
|
||||
"environmentVariables": {
|
||||
"DOTNET_ENVIRONMENT": "Development",
|
||||
"ASPNETCORE_ENVIRONMENT": "Development",
|
||||
"SCADALINK_CONFIG": "Central"
|
||||
}
|
||||
},
|
||||
"ScadaBridge Site": {
|
||||
"commandName": "Project",
|
||||
"dotnetRunMessages": true,
|
||||
"launchBrowser": false,
|
||||
"environmentVariables": {
|
||||
"DOTNET_ENVIRONMENT": "Development",
|
||||
"ASPNETCORE_ENVIRONMENT": "Development",
|
||||
"SCADALINK_CONFIG": "Site"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host;
|
||||
|
||||
/// <summary>
/// Supplies the site identity taken from <see cref="NodeOptions.SiteId"/>.
/// </summary>
public class SiteIdentityProvider : ISiteIdentityProvider
{
    /// <summary>
    /// Initializes the provider by reading the site id from node options.
    /// </summary>
    /// <param name="nodeOptions">Node configuration containing the site id.</param>
    /// <exception cref="InvalidOperationException">Thrown when the configured site id is null.</exception>
    public SiteIdentityProvider(IOptions<NodeOptions> nodeOptions)
    {
        var configuredSiteId = nodeOptions.Value.SiteId;
        if (configuredSiteId is null)
            throw new InvalidOperationException("SiteId is required for site nodes.");

        SiteId = configuredSiteId;
    }

    /// <inheritdoc />
    public string SiteId { get; }
}
|
||||
@@ -0,0 +1,111 @@
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog;
|
||||
using ZB.MOM.WW.ScadaBridge.ClusterInfrastructure;
|
||||
using ZB.MOM.WW.ScadaBridge.Communication;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
using ZB.MOM.WW.ScadaBridge.DataConnectionLayer;
|
||||
using ZB.MOM.WW.ScadaBridge.ExternalSystemGateway;
|
||||
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
using ZB.MOM.WW.ScadaBridge.Host.Actors;
|
||||
using ZB.MOM.WW.ScadaBridge.Host.Health;
|
||||
using ZB.MOM.WW.ScadaBridge.NotificationService;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteEventLogging;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime;
|
||||
using ZB.MOM.WW.ScadaBridge.StoreAndForward;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host;
|
||||
|
||||
/// <summary>
/// Composition root for the site role. The registrations live here (rather than
/// inline in Program.cs) so that Program.cs and the tests build the container
/// from exactly the same code.
/// </summary>
public static class SiteServiceRegistration
{
    /// <summary>Adds every service registration and options binding the site role needs.</summary>
    /// <param name="services">Service collection that receives the registrations.</param>
    /// <param name="config">Configuration root the option sections are read from.</param>
    public static void Configure(IServiceCollection services, IConfiguration config)
    {
        // --- Shared components -------------------------------------------------
        services.AddClusterInfrastructure();
        services.AddCommunication();
        services.AddSiteHealthMonitoring();
        services.AddExternalSystemGateway();

        // AddNotificationService() is deliberately absent from the site path:
        // sites no longer deliver notifications over SMTP. A buffered notification
        // is forwarded to the central cluster (NotificationForwarder /
        // SiteCommunicationActor) and central owns SMTP delivery through the
        // Notification Outbox, so the SMTP machinery (OAuth2TokenService,
        // ISmtpClientWrapper) has no consumer on a site node.

        // --- Health report transport -------------------------------------------
        // SiteHealthReport is sent to SiteCommunicationActor via Akka.
        services.AddSingleton<ISiteIdentityProvider, SiteIdentityProvider>();
        services.AddSingleton<IHealthReportTransport, AkkaHealthReportTransport>();

        // --- Site-only components ----------------------------------------------
        // AddSiteRuntime registers SiteStorageService with the SQLite path plus the
        // site-local repository implementations (IExternalSystemRepository,
        // INotificationRepository).
        var siteDbPath = config["ScadaBridge:Database:SiteDbPath"];
        siteDbPath ??= "site.db";
        services.AddSiteRuntime($"Data Source={siteDbPath}");
        services.AddDataConnectionLayer();

        // Audit Log #23 (M3 Bundle F): this adapter hands the site id to
        // StoreAndForwardService through DI without creating a
        // StoreAndForward → HealthMonitoring project-reference cycle. It has to be
        // registered BEFORE AddStoreAndForward: the S&F service is a singleton, so
        // if its factory resolved an empty site id at construction time that empty
        // value would stay cached for the lifetime of the process.
        services.AddSingleton<ZB.MOM.WW.ScadaBridge.StoreAndForward.IStoreAndForwardSiteContext, StoreAndForwardSiteContext>();
        services.AddStoreAndForward();
        services.AddSiteEventLogging();

        // Audit Log (#23): site-side hot-path writer and telemetry collaborators.
        // AkkaHostedService registers the SiteAuditTelemetryActor itself in the
        // site-role block; this call wires the DI dependencies it (and
        // ScriptRuntimeContext, once Bundle F lands) reaches for.
        services.AddAuditLog(config);

        // Audit Log (#23) M2 Bundle G: bridges FallbackAuditWriter primary failures
        // into the site health report payload as SiteAuditWriteFailures. Must stay
        // AFTER AddSiteHealthMonitoring (registers ISiteHealthCollector) and
        // AFTER AddAuditLog (registers the NoOp default this call replaces).
        services.AddAuditLogHealthMetricsBridge();

        // --- WP-13: Akka.NET bootstrap via hosted service -----------------------
        // Registered as a singleton first so the hosted-service registration and
        // the cluster node provider below both resolve the same instance.
        services.AddSingleton<AkkaHostedService>();
        services.AddHostedService(sp => sp.GetRequiredService<AkkaHostedService>());

        // --- Cluster node status provider for health reports --------------------
        services.AddSingleton<IClusterNodeProvider>(sp =>
            new AkkaClusterNodeProvider(
                sp.GetRequiredService<AkkaHostedService>(),
                $"site-{sp.GetRequiredService<Microsoft.Extensions.Options.IOptions<NodeOptions>>().Value.SiteId}"));

        // --- Options binding ----------------------------------------------------
        BindSharedOptions(services, config);
        services
            .Configure<SiteRuntimeOptions>(config.GetSection("ScadaBridge:SiteRuntime"))
            .Configure<DataConnectionOptions>(config.GetSection("ScadaBridge:DataConnection"))
            .Configure<StoreAndForwardOptions>(config.GetSection("ScadaBridge:StoreAndForward"))
            .Configure<SiteEventLogOptions>(config.GetSection("ScadaBridge:SiteEventLog"));
    }

    /// <summary>
    /// Binds the option sections common to the site and central roles (Node, Cluster,
    /// Database, Communication, HealthMonitoring, Notification, Logging) and registers
    /// the node identity provider.
    /// </summary>
    /// <param name="services">Service collection that receives the option bindings.</param>
    /// <param name="config">Configuration root providing the option values.</param>
    public static void BindSharedOptions(IServiceCollection services, IConfiguration config)
    {
        services
            .Configure<NodeOptions>(config.GetSection("ScadaBridge:Node"))
            .Configure<ClusterOptions>(config.GetSection("ScadaBridge:Cluster"))
            .Configure<DatabaseOptions>(config.GetSection("ScadaBridge:Database"))
            .Configure<CommunicationOptions>(config.GetSection("ScadaBridge:Communication"))
            .Configure<HealthMonitoringOptions>(config.GetSection("ScadaBridge:HealthMonitoring"))
            .Configure<NotificationOptions>(config.GetSection("ScadaBridge:Notification"))
            .Configure<LoggingOptions>(config.GetSection("ScadaBridge:Logging"));

        // Audit Log (#23): surfaces ScadaBridge:Node:NodeName to downstream audit
        // writers so they can stamp the SourceNode column. It lives in the shared
        // bootstrap because every node (central and site) needs it.
        services.AddSingleton<INodeIdentityProvider, NodeIdentityProvider>();
    }
}
|
||||
@@ -0,0 +1,122 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host;
|
||||
|
||||
/// <summary>
/// Bounded retry-with-backoff for startup preconditions.
///
/// Host-010 / REQ-HOST-4a: a Central node applies/validates database migrations
/// before the host begins serving traffic. In container orchestration the database
/// and the app frequently start together, so the database may be briefly
/// unreachable. Rather than crashing the process on the first connection failure,
/// the migration step is wrapped in this bounded exponential backoff: it tolerates a
/// short outage and only fails fatally once attempts are exhausted.
///
/// Host-015: only <em>transient</em> faults are retried. The optional
/// <c>isTransient</c> predicate classifies each exception; a permanent failure
/// (e.g. a database schema-version mismatch — which no amount of waiting can fix)
/// is rethrown immediately rather than being retried for minutes before the
/// inevitable fatal exit.
/// </summary>
public static class StartupRetry
{
    /// <summary>
    /// Executes an asynchronous operation with bounded exponential backoff, retrying only transient faults.
    /// </summary>
    /// <param name="operationName">Human-readable name of the operation, used in log messages.</param>
    /// <param name="operation">The operation to attempt.</param>
    /// <param name="maxAttempts">Maximum number of attempts before the exception propagates.</param>
    /// <param name="initialDelay">Delay before the second attempt; doubled on each subsequent retry, capped at 30 seconds.</param>
    /// <param name="logger">Logger for retry warnings.</param>
    /// <param name="isTransient">Optional predicate classifying an exception as transient; null means all exceptions are transient.</param>
    /// <param name="cancellationToken">Cancellation token that aborts the retry loop immediately.</param>
    /// <exception cref="ArgumentNullException"><paramref name="operation"/> is <c>null</c>.</exception>
    public static Task ExecuteWithRetryAsync(
        string operationName,
        Func<Task> operation,
        int maxAttempts,
        TimeSpan initialDelay,
        ILogger logger,
        Func<Exception, bool>? isTransient = null,
        CancellationToken cancellationToken = default)
    {
        // Guard here as well as in the token-aware overload: the wrapper lambda
        // below is never null, so a null delegate would otherwise only surface as a
        // NullReferenceException inside the retry loop — where the default
        // predicate would treat it as transient and retry it.
        ArgumentNullException.ThrowIfNull(operation);

        return ExecuteWithRetryAsync(operationName, _ => operation(), maxAttempts, initialDelay, logger, isTransient, cancellationToken);
    }

    /// <summary>
    /// Executes an asynchronous operation with bounded exponential backoff, retrying only transient faults.
    /// Overload that forwards the retry-loop cancellation token to the operation itself —
    /// Host-019: needed so callers (e.g. the database-migration step) can honour
    /// <c>IHostApplicationLifetime.ApplicationStopping</c> inside the operation as well
    /// as inside the inter-attempt <c>Task.Delay</c>.
    /// </summary>
    /// <param name="operationName">Human-readable name of the operation, used in log messages.</param>
    /// <param name="operation">The operation to attempt; receives <paramref name="cancellationToken"/>.</param>
    /// <param name="maxAttempts">Maximum number of attempts before the exception propagates.</param>
    /// <param name="initialDelay">Delay before the second attempt; doubled on each subsequent retry, capped at 30 seconds.</param>
    /// <param name="logger">Logger for retry warnings.</param>
    /// <param name="isTransient">Optional predicate classifying an exception as transient; null means all exceptions are transient.</param>
    /// <param name="cancellationToken">Cancellation token checked before each attempt and observed by the inter-attempt delay.</param>
    /// <exception cref="ArgumentNullException"><paramref name="operation"/> is <c>null</c>.</exception>
    public static async Task ExecuteWithRetryAsync(
        string operationName,
        Func<CancellationToken, Task> operation,
        int maxAttempts,
        TimeSpan initialDelay,
        ILogger logger,
        Func<Exception, bool>? isTransient = null,
        CancellationToken cancellationToken = default)
    {
        // A null delegate is a programming error, not a transient fault: reject it
        // up front instead of letting it be retried with backoff.
        ArgumentNullException.ThrowIfNull(operation);

        // Default: treat every exception as transient (preserves the pre-Host-015
        // behaviour for callers that do not classify faults).
        isTransient ??= static _ => true;

        var delay = initialDelay;
        for (var attempt = 1; ; attempt++)
        {
            cancellationToken.ThrowIfCancellationRequested();
            try
            {
                await operation(cancellationToken);
                if (attempt > 1)
                {
                    logger.LogInformation(
                        "Startup operation '{Operation}' succeeded on attempt {Attempt}.",
                        operationName, attempt);
                }

                return;
            }
            // The filter lets the exception propagate untouched once attempts are
            // exhausted or when the fault is classified as permanent.
            catch (Exception ex) when (attempt < maxAttempts && isTransient(ex))
            {
                logger.LogWarning(ex,
                    "Startup operation '{Operation}' failed (transient) on attempt {Attempt}/{MaxAttempts}; " +
                    "retrying in {Delay}.",
                    operationName, attempt, maxAttempts, delay);

                await Task.Delay(delay, cancellationToken);

                // Exponential backoff, capped so the total wait stays bounded.
                delay = TimeSpan.FromTicks(Math.Min(delay.Ticks * 2, TimeSpan.FromSeconds(30).Ticks));
            }
        }
    }

    /// <summary>
    /// Transient-fault classifier for the database-migration startup step (Host-015).
    /// Returns <c>true</c> only for connection-class faults that a brief wait can
    /// resolve — a database provider error, a socket error or a timeout — and
    /// <c>false</c> for everything else (notably schema-validation
    /// <see cref="InvalidOperationException"/>s raised by
    /// <c>MigrationHelper.ApplyOrValidateMigrationsAsync</c>, which are permanent and
    /// must fail fast).
    /// </summary>
    /// <param name="ex">The exception to classify.</param>
    /// <returns><c>true</c> when the fault is considered transient; otherwise <c>false</c>.</returns>
    public static bool IsTransientDatabaseFault(Exception ex)
    {
        // Unwrap a single layer of aggregation so a faulted Task surfaces correctly.
        if (ex is AggregateException agg && agg.InnerException != null)
            ex = agg.InnerException;

        if (ex is TimeoutException)
            return true;

        // Socket / network errors raised while opening the connection.
        if (ex is System.Net.Sockets.SocketException)
            return true;

        // ADO.NET provider exceptions derive from the BCL's
        // System.Data.Common.DbException, so the type check covers them without a
        // provider package reference — including providers whose exception type
        // name does not end in "SqlException"/"DbException".
        if (ex is System.Data.Common.DbException)
            return true;

        // Name-based fallback, kept for backward compatibility with exception types
        // that follow the naming convention without deriving from DbException.
        // A SqlException at the migration stage is, in practice, a connection
        // failure (the server is not yet reachable) rather than a schema fault —
        // schema mismatches surface as InvalidOperationException from the
        // migration helper.
        var typeName = ex.GetType().FullName;
        if (typeName != null &&
            (typeName.EndsWith("SqlException", StringComparison.Ordinal) ||
             typeName.EndsWith("DbException", StringComparison.Ordinal)))
            return true;

        return false;
    }
}
|
||||
@@ -0,0 +1,101 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host;
|
||||
|
||||
/// <summary>
/// Validates required configuration before Akka.NET actor system creation.
/// Runs early in startup to fail fast with clear error messages.
/// </summary>
public static class StartupValidator
{
    /// <summary>Validates required configuration values and throws <see cref="InvalidOperationException"/> listing all errors if any are found.</summary>
    /// <param name="configuration">The application configuration to validate.</param>
    /// <exception cref="InvalidOperationException">One or more configuration values are missing or invalid.</exception>
    public static void Validate(IConfiguration configuration)
    {
        var errors = new List<string>();

        var nodeSection = configuration.GetSection("ScadaBridge:Node");
        var role = nodeSection["Role"];
        if (string.IsNullOrEmpty(role) || (role != "Central" && role != "Site"))
            errors.Add("ScadaBridge:Node:Role must be 'Central' or 'Site'");

        if (string.IsNullOrEmpty(nodeSection["NodeHostname"]))
            errors.Add("ScadaBridge:Node:NodeHostname is required");

        // Remember whether the remoting port is usable: a failed TryParse leaves
        // 'port' at 0, which must not take part in the port comparisons below.
        var portStr = nodeSection["RemotingPort"];
        var remotingPortValid = int.TryParse(portStr, out var port) && port >= 1 && port <= 65535;
        if (!remotingPortValid)
            errors.Add("ScadaBridge:Node:RemotingPort must be 1-65535");

        if (role == "Site" && string.IsNullOrEmpty(nodeSection["SiteId"]))
            errors.Add("ScadaBridge:Node:SiteId is required for Site nodes");

        if (role == "Central")
        {
            var dbSection = configuration.GetSection("ScadaBridge:Database");
            if (string.IsNullOrEmpty(dbSection["ConfigurationDb"]))
                errors.Add("ScadaBridge:Database:ConfigurationDb connection string required for Central");

            var secSection = configuration.GetSection("ScadaBridge:Security");
            if (string.IsNullOrEmpty(secSection["LdapServer"]))
                errors.Add("ScadaBridge:Security:LdapServer required for Central");
            if (string.IsNullOrEmpty(secSection["JwtSigningKey"]))
                errors.Add("ScadaBridge:Security:JwtSigningKey required for Central");
        }

        var seedNodes = configuration.GetSection("ScadaBridge:Cluster:SeedNodes").Get<List<string>>();
        if (seedNodes == null || seedNodes.Count < 2)
            errors.Add("ScadaBridge:Cluster:SeedNodes must have at least 2 entries");

        if (role == "Site")
        {
            var grpcPortStr = nodeSection["GrpcPort"];
            int grpcPort = 8083; // NodeOptions default when the key is absent
            var grpcPortValid = true;
            if (grpcPortStr != null && (!int.TryParse(grpcPortStr, out grpcPort) || grpcPort < 1 || grpcPort > 65535))
            {
                grpcPortValid = false;
                errors.Add("ScadaBridge:Node:GrpcPort must be 1-65535");
            }

            // Host-007 / REQ-HOST-4: the gRPC (Kestrel HTTP/2) port and the Akka
            // remoting port must differ. Identical values make Kestrel and
            // Akka.Remote contend for the same TCP port and fail opaquely at
            // runtime. Uses the resolved GrpcPort, including the 8083 default.
            // Only compared when both ports are valid: two unparseable values both
            // read as 0 and would otherwise raise a misleading collision error on
            // top of the range errors already recorded.
            if (remotingPortValid && grpcPortValid && port == grpcPort)
                errors.Add("ScadaBridge:Node:GrpcPort must differ from RemotingPort");

            var dbSection = configuration.GetSection("ScadaBridge:Database");
            if (string.IsNullOrEmpty(dbSection["SiteDbPath"]))
                errors.Add("ScadaBridge:Database:SiteDbPath required for Site nodes");

            // Host-004: a seed node must reference an Akka.Remote endpoint, never the
            // Kestrel HTTP/2 gRPC port. A seed entry whose port equals this node's
            // GrpcPort would make a joining node attempt an Akka.Remote TCP
            // association against the gRPC listener and fail.
            // Skipped when GrpcPort itself is invalid: an out-of-range value such as
            // -1 would otherwise "match" every seed whose port cannot be parsed
            // (SeedNodePort returns -1 for those).
            if (seedNodes != null && grpcPortValid)
            {
                foreach (var seed in seedNodes)
                {
                    if (SeedNodePort(seed) == grpcPort)
                        errors.Add(
                            $"ScadaBridge:Cluster:SeedNodes entry '{seed}' must not target the gRPC port " +
                            $"({grpcPort}); seed nodes must reference Akka remoting ports");
                }
            }
        }

        if (errors.Count > 0)
            throw new InvalidOperationException(
                $"Configuration validation failed:\n{string.Join("\n", errors.Select(e => $" - {e}"))}");
    }

    /// <summary>
    /// Extracts the TCP port from an Akka seed-node address of the form
    /// <c>akka.tcp://system@host:port</c>. Returns <c>-1</c> when no port can be parsed.
    /// </summary>
    /// <param name="seedNode">The seed-node address to inspect.</param>
    /// <returns>The parsed port, or <c>-1</c> when the text after the last colon is missing or not an integer.</returns>
    private static int SeedNodePort(string seedNode)
    {
        if (string.IsNullOrWhiteSpace(seedNode))
            return -1;

        // The port is whatever follows the final ':' in the address.
        var lastColon = seedNode.LastIndexOf(':');
        if (lastColon < 0 || lastColon == seedNode.Length - 1)
            return -1;

        return int.TryParse(seedNode[(lastColon + 1)..], out var port) ? port : -1;
    }
}
|
||||
@@ -0,0 +1,35 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.StoreAndForward;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Host;
|
||||
|
||||
/// <summary>
/// Audit Log #23 (M3 Bundle F): Host-side implementation of the optional
/// <see cref="IStoreAndForwardSiteContext"/> that the Store-and-Forward service
/// consults to stamp cached-call audit telemetry with the site id.
/// </summary>
/// <remarks>
/// Passes <see cref="NodeOptions.SiteId"/> through unchanged — the same value
/// <see cref="SiteIdentityProvider"/> exposes to HealthMonitoring. It is a separate
/// adapter (instead of a reuse of <see cref="SiteIdentityProvider"/>) so that the
/// StoreAndForward project does not have to reference HealthMonitoring, which
/// would create a project-reference cycle.
/// </remarks>
public class StoreAndForwardSiteContext : IStoreAndForwardSiteContext
{
    /// <inheritdoc />
    public string SiteId { get; }

    /// <summary>Creates the adapter, capturing the site id from the node options.</summary>
    /// <param name="nodeOptions">Options wrapper whose <c>Value.SiteId</c> supplies the site identifier.</param>
    /// <exception cref="InvalidOperationException">The configured site id is <c>null</c>.</exception>
    public StoreAndForwardSiteContext(IOptions<NodeOptions> nodeOptions)
    {
        // NodeOptions.SiteId is nullable. SiteServiceRegistration ONLY adds this
        // binding on the site role, so a value is expected here; the hard fail
        // mirrors SiteIdentityProvider so that a missing site id surfaces at
        // composition time rather than at the first cached call.
        var configuredSiteId = nodeOptions.Value.SiteId;
        if (configuredSiteId is null)
        {
            throw new InvalidOperationException(
                "ScadaBridge:Node:SiteId is required for the site role's StoreAndForward wiring.");
        }

        SiteId = configuredSiteId;
    }
}
|
||||
@@ -0,0 +1,58 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk.Web">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<InternalsVisibleTo Include="ZB.MOM.WW.ScadaBridge.Host.Tests" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Akka.Cluster.Hosting" />
|
||||
<PackageReference Include="Akka.Cluster.Tools" />
|
||||
<PackageReference Include="Akka.Hosting" />
|
||||
<PackageReference Include="Akka.Remote.Hosting" />
|
||||
<PackageReference Include="AspNetCore.HealthChecks.UI.Client" />
|
||||
<PackageReference Include="Microsoft.EntityFrameworkCore.Design">
|
||||
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
|
||||
<PrivateAssets>all</PrivateAssets>
|
||||
</PackageReference>
|
||||
<PackageReference Include="Grpc.AspNetCore" />
|
||||
<PackageReference Include="Microsoft.Extensions.Hosting.WindowsServices" />
|
||||
<PackageReference Include="Serilog.AspNetCore" />
|
||||
<PackageReference Include="Serilog.Sinks.Console" />
|
||||
<PackageReference Include="Serilog.Sinks.File" />
|
||||
<!-- Transitive override: Akka.Hosting 1.5.62 pins OpenTelemetry.Api 1.9.0 which is flagged
|
||||
(GHSA-g94r-2vxg-569j, GHSA-8785-wc3w-h8q6). Bumping directly clears both advisories. -->
|
||||
<PackageReference Include="OpenTelemetry.Api" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Commons/ZB.MOM.WW.ScadaBridge.Commons.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.TemplateEngine/ZB.MOM.WW.ScadaBridge.TemplateEngine.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.DeploymentManager/ZB.MOM.WW.ScadaBridge.DeploymentManager.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.SiteRuntime/ZB.MOM.WW.ScadaBridge.SiteRuntime.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.DataConnectionLayer/ZB.MOM.WW.ScadaBridge.DataConnectionLayer.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Communication/ZB.MOM.WW.ScadaBridge.Communication.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.StoreAndForward/ZB.MOM.WW.ScadaBridge.StoreAndForward.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.NotificationService/ZB.MOM.WW.ScadaBridge.NotificationService.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.NotificationOutbox/ZB.MOM.WW.ScadaBridge.NotificationOutbox.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Transport/ZB.MOM.WW.ScadaBridge.Transport.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.AuditLog/ZB.MOM.WW.ScadaBridge.AuditLog.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.SiteCallAudit/ZB.MOM.WW.ScadaBridge.SiteCallAudit.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.CentralUI/ZB.MOM.WW.ScadaBridge.CentralUI.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Security/ZB.MOM.WW.ScadaBridge.Security.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.HealthMonitoring/ZB.MOM.WW.ScadaBridge.HealthMonitoring.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.SiteEventLogging/ZB.MOM.WW.ScadaBridge.SiteEventLogging.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.ClusterInfrastructure/ZB.MOM.WW.ScadaBridge.ClusterInfrastructure.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.InboundAPI/ZB.MOM.WW.ScadaBridge.InboundAPI.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.ManagementService/ZB.MOM.WW.ScadaBridge.ManagementService.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
@@ -0,0 +1,8 @@
|
||||
@using System.Net.Http
|
||||
@using Microsoft.AspNetCore.Authorization
|
||||
@using Microsoft.AspNetCore.Components.Authorization
|
||||
@using Microsoft.AspNetCore.Components.Forms
|
||||
@using Microsoft.AspNetCore.Components.Routing
|
||||
@using Microsoft.AspNetCore.Components.Web
|
||||
@using static Microsoft.AspNetCore.Components.Web.RenderMode
|
||||
@using ZB.MOM.WW.ScadaBridge.CentralUI.Components.Shared
|
||||
@@ -0,0 +1,77 @@
|
||||
{
|
||||
"ScadaBridge": {
|
||||
"_nodeName": "Host-018: NodeName stamps SourceNode on AuditLog/Notifications/SiteCalls rows (CLAUDE.md 'Centralized Audit Log' decision) and backs IX_AuditLog_Node_Occurred. Convention: 'central-a'/'central-b' for central nodes, 'node-a'/'node-b' for site nodes. Override per-node in multi-node deployments (the docker per-node configs do this). When left at the default below, single-node dev rows are stamped with 'central-a'; an empty value normalises to a NULL SourceNode.",
|
||||
"Node": {
|
||||
"Role": "Central",
|
||||
"NodeHostname": "localhost",
|
||||
"RemotingPort": 8081,
|
||||
"NodeName": "central-a"
|
||||
},
|
||||
"Cluster": {
|
||||
"SeedNodes": [
|
||||
"akka.tcp://scadabridge@localhost:8081",
|
||||
"akka.tcp://scadabridge@localhost:8082"
|
||||
],
|
||||
"SplitBrainResolverStrategy": "keep-oldest",
|
||||
"StableAfter": "00:00:15",
|
||||
"HeartbeatInterval": "00:00:02",
|
||||
"FailureDetectionThreshold": "00:00:10",
|
||||
"MinNrOfMembers": 1
|
||||
},
|
||||
"_secrets": "Host-003: Secrets are NOT committed in this file. Supply them via environment variables, which the Host's configuration builder (AddEnvironmentVariables) overlays over this file. Required: ScadaBridge__Database__ConfigurationDb, ScadaBridge__Security__LdapServiceAccountPassword, ScadaBridge__Security__JwtSigningKey. The ${...} placeholders below are intentionally non-functional and must be overridden per environment.",
|
||||
"Database": {
|
||||
"ConfigurationDb": "${SCADALINK_CONFIGURATIONDB_CONNECTION_STRING}"
|
||||
},
|
||||
"Security": {
|
||||
"LdapServer": "localhost",
|
||||
"LdapPort": 3893,
|
||||
"LdapUseTls": false,
|
||||
"AllowInsecureLdap": true,
|
||||
"LdapSearchBase": "dc=scadabridge,dc=local",
|
||||
"LdapServiceAccountDn": "cn=admin,dc=scadabridge,dc=local",
|
||||
"LdapServiceAccountPassword": "${SCADALINK_LDAP_SERVICE_ACCOUNT_PASSWORD}",
|
||||
"JwtSigningKey": "${SCADALINK_JWT_SIGNING_KEY}",
|
||||
"JwtExpiryMinutes": 15,
|
||||
"IdleTimeoutMinutes": 30
|
||||
},
|
||||
"Communication": {
|
||||
"DeploymentTimeout": "00:02:00",
|
||||
"LifecycleTimeout": "00:00:30",
|
||||
"QueryTimeout": "00:00:30",
|
||||
"TransportHeartbeatInterval": "00:00:05",
|
||||
"TransportFailureThreshold": "00:00:15"
|
||||
},
|
||||
"HealthMonitoring": {
|
||||
"ReportInterval": "00:00:30",
|
||||
"OfflineTimeout": "00:01:00"
|
||||
},
|
||||
"InboundApi": {
|
||||
"DefaultMethodTimeout": "00:00:30"
|
||||
},
|
||||
"Notification": {
|
||||
"SmtpServer": "localhost",
|
||||
"SmtpPort": 1025,
|
||||
"AuthMode": "None",
|
||||
"FromAddress": "scada-notifications@company.com"
|
||||
},
|
||||
"NotificationOutbox": {
|
||||
"DispatchInterval": "00:00:10",
|
||||
"DispatchBatchSize": 100,
|
||||
"StuckAgeThreshold": "00:10:00",
|
||||
"TerminalRetention": "365.00:00:00",
|
||||
"PurgeInterval": "1.00:00:00",
|
||||
"DeliveredKpiWindow": "00:01:00"
|
||||
},
|
||||
"Transport": {
|
||||
"BundleSessionTtlMinutes": 30,
|
||||
"MaxBundleSizeMb": 100,
|
||||
"MaxUnlockAttemptsPerSession": 3,
|
||||
"MaxUnlockAttemptsPerIpPerHour": 10,
|
||||
"Pbkdf2Iterations": 600000,
|
||||
"SchemaVersionMajor": 1
|
||||
},
|
||||
"Logging": {
|
||||
"MinimumLevel": "Information"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,60 @@
|
||||
{
|
||||
"ScadaBridge": {
|
||||
"_nodeName": "Host-018: NodeName stamps SourceNode on AuditLog/Notifications/SiteCalls rows (CLAUDE.md 'Centralized Audit Log' decision) and backs IX_AuditLog_Node_Occurred. Convention: 'node-a'/'node-b' for site nodes, 'central-a'/'central-b' for central nodes. Override per-node in multi-node deployments (the docker per-node configs do this). When left at the default below, single-node dev rows are stamped with 'node-a'; an empty value normalises to a NULL SourceNode.",
|
||||
"Node": {
|
||||
"Role": "Site",
|
||||
"NodeHostname": "localhost",
|
||||
"SiteId": "site-a",
|
||||
"RemotingPort": 8082,
|
||||
"GrpcPort": 8083,
|
||||
"NodeName": "node-a"
|
||||
},
|
||||
"Cluster": {
|
||||
"SeedNodes": [
|
||||
"akka.tcp://scadabridge@localhost:8082",
|
||||
"akka.tcp://scadabridge@localhost:8084"
|
||||
],
|
||||
"SplitBrainResolverStrategy": "keep-oldest",
|
||||
"StableAfter": "00:00:15",
|
||||
"HeartbeatInterval": "00:00:02",
|
||||
"FailureDetectionThreshold": "00:00:10",
|
||||
"MinNrOfMembers": 1
|
||||
},
|
||||
"Database": {
|
||||
"SiteDbPath": "./data/scadabridge.db"
|
||||
},
|
||||
"DataConnection": {
|
||||
"ReconnectInterval": "00:00:05",
|
||||
"TagResolutionRetryInterval": "00:00:10",
|
||||
"WriteTimeout": "00:00:30"
|
||||
},
|
||||
"StoreAndForward": {
|
||||
"SqliteDbPath": "./data/store-and-forward.db",
|
||||
"ReplicationEnabled": true
|
||||
},
|
||||
"Communication": {
|
||||
"_centralContactPoints": "Host-016: each entry MUST be a central node's remoting endpoint, NOT this site's own remoting port. The single dev-loopback default below points only at central-a (localhost:8081). In a multi-central deployment add the second central node here (e.g. 'akka.tcp://scadabridge@central-b-host:8081') so ClusterClient can fail over when central-a is down. The previous template listed localhost:8082 as the second contact — that is THIS site's own RemotingPort and is a permanent failure in the initial-contact rotation.",
|
||||
"CentralContactPoints": [
|
||||
"akka.tcp://scadabridge@localhost:8081"
|
||||
],
|
||||
"DeploymentTimeout": "00:02:00",
|
||||
"LifecycleTimeout": "00:00:30",
|
||||
"QueryTimeout": "00:00:30",
|
||||
"TransportHeartbeatInterval": "00:00:05",
|
||||
"TransportFailureThreshold": "00:00:15"
|
||||
},
|
||||
"HealthMonitoring": {
|
||||
"ReportInterval": "00:00:30",
|
||||
"OfflineTimeout": "00:01:00"
|
||||
},
|
||||
"SiteEventLog": {
|
||||
"RetentionDays": 30,
|
||||
"MaxStorageMb": 1024,
|
||||
"PurgeScheduleCron": "0 2 * * *"
|
||||
},
|
||||
"Notification": {},
|
||||
"Logging": {
|
||||
"MinimumLevel": "Information"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"_logging": "Host-021: Serilog is the sole logger provider (Program.cs calls builder.Host.UseSerilog()), so the standard Microsoft 'Logging:LogLevel' block has no effect and was removed. The minimum level is set via 'ScadaBridge:Logging:MinimumLevel' (bound to LoggingOptions per Host-011); sinks are defined under the 'Serilog' section below and applied via ReadFrom.Configuration (Host-014). See LoggerConfigurationFactory + Component-Host.md REQ-HOST-8.",
|
||||
"Serilog": {
|
||||
"Using": [
|
||||
"Serilog.Sinks.Console",
|
||||
"Serilog.Sinks.File"
|
||||
],
|
||||
"WriteTo": [
|
||||
{
|
||||
"Name": "Console",
|
||||
"Args": {
|
||||
"outputTemplate": "[{Timestamp:HH:mm:ss} {Level:u3}] [{NodeRole}/{NodeHostname}] {Message:lj}{NewLine}{Exception}"
|
||||
}
|
||||
},
|
||||
{
|
||||
"Name": "File",
|
||||
"Args": {
|
||||
"path": "logs/scadabridge-.log",
|
||||
"rollingInterval": "Day"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
// Helpers exposed on window.treeviewStorage: sessionStorage persistence under a
// "treeview:"-prefixed key, plus a setter for a checkbox's indeterminate state.
window.treeviewStorage = {
    // Stores keysJson in sessionStorage under the prefixed storage key.
    save: function (storageKey, keysJson) {
        var itemKey = "treeview:" + storageKey;
        sessionStorage.setItem(itemKey, keysJson);
    },

    // Returns the value stored under the prefixed storage key (whatever
    // sessionStorage.getItem yields for it).
    load: function (storageKey) {
        var itemKey = "treeview:" + storageKey;
        return sessionStorage.getItem(itemKey);
    },

    // Blazor can bind `checked` but not input.indeterminate, so the TreeView's
    // Checkbox-selection mode calls this from OnAfterRenderAsync to apply the
    // tri-state visual on each render. A missing element is ignored.
    setIndeterminate: function (el, value) {
        if (!el) {
            return;
        }
        el.indeterminate = Boolean(value);
    }
};
|
||||
+2078
File diff suppressed because it is too large
Load Diff
Binary file not shown.
BIN
Binary file not shown.
File diff suppressed because one or more lines are too long
+7
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user