refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)
Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
This commit is contained in:
@@ -0,0 +1,582 @@
|
||||
using System.Collections.Immutable;
|
||||
using Akka.Actor;
|
||||
using Akka.Cluster.Tools.Client;
|
||||
using Akka.Cluster.Tools.PublishSubscribe;
|
||||
using Akka.Event;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Communication;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Notification;
|
||||
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Communication.Actors;
|
||||
|
||||
/// <summary>
/// Factory seam for building the ClusterClient actor used to reach a single site.
/// Exists so tests can substitute a stand-in for the real ClusterClient.
/// </summary>
public interface ISiteClientFactory
{
    /// <summary>Builds the ClusterClient actor for one site from its contact points.</summary>
    /// <param name="system">Actor system that will host the client actor.</param>
    /// <param name="siteId">Identifier of the target site; implementations may use it to name the actor.</param>
    /// <param name="contacts">Receptionist actor paths handed to the client as its initial contacts.</param>
    /// <returns>Reference to the newly created ClusterClient actor.</returns>
    IActorRef Create(ActorSystem system, string siteId, ImmutableHashSet<ActorPath> contacts);
}
|
||||
|
||||
/// <summary>
/// Production <see cref="ISiteClientFactory"/>: spawns a real ClusterClient per site.
/// </summary>
public class DefaultSiteClientFactory : ISiteClientFactory
{
    /// <inheritdoc />
    public IActorRef Create(ActorSystem system, string siteId, ImmutableHashSet<ActorPath> contacts)
    {
        // Default ClusterClient settings for this system, pointed at the site's receptionists.
        var clientProps = ClusterClient.Props(
            ClusterClientSettings.Create(system).WithInitialContacts(contacts));

        // The actor is created directly on the system with a name derived from the site id.
        var actorName = $"site-client-{siteId}";
        return system.ActorOf(clientProps, actorName);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Central-side actor that routes messages from central to site clusters via ClusterClient.
|
||||
/// Resolves site addresses from the database on a periodic refresh cycle and manages
|
||||
/// per-site ClusterClient instances.
|
||||
///
|
||||
/// WP-4: All 8 message patterns routed through this actor.
|
||||
/// WP-5: Ask timeout on connection drop (no central buffering). Debug streams killed on interruption.
|
||||
/// </summary>
|
||||
public class CentralCommunicationActor : ReceiveActor
|
||||
{
|
||||
private readonly ILoggingAdapter _log = Context.GetLogger();
|
||||
private readonly IServiceProvider _serviceProvider;
|
||||
private readonly ISiteClientFactory _siteClientFactory;
|
||||
|
||||
/// <summary>
|
||||
/// Per-site ClusterClient instances and their contact addresses.
|
||||
/// Maps SiteIdentifier → (ClusterClient actor, set of contact address strings).
|
||||
/// Refreshed periodically via RefreshSiteAddresses.
|
||||
/// </summary>
|
||||
private readonly Dictionary<string, (IActorRef Client, ImmutableHashSet<string> ContactAddresses)> _siteClients = new();
|
||||
|
||||
// Communication-016: the previous _debugSubscriptions / _inProgressDeployments
|
||||
// dictionaries existed solely to support a documented "synchronous kill streams +
|
||||
// mark deployments failed on site disconnect" workflow triggered by
|
||||
// ConnectionStateChanged. No production code ever emitted that message — only
|
||||
// the unit test did — so the workflow was dead from end to end. Disconnect
|
||||
// detection is owned by the underlying transports: the gRPC keepalive PING
|
||||
// signals stream interruption in ~25s (handled by DebugStreamBridgeActor's own
|
||||
// reconnection logic), and an Ask round-trip for a deploy times out at the
|
||||
// CommunicationService layer (caller sees failure). The tracking dicts +
|
||||
// ConnectionStateChanged record + HandleConnectionStateChanged handler are
|
||||
// removed; see docs/requirements/Component-Communication.md "Connection
|
||||
// Failure Behavior" for the keepalive-based contract that survives.
|
||||
|
||||
private ICancelable? _refreshSchedule;
|
||||
|
||||
/// <summary>
|
||||
/// Communication-019: per-actor lifecycle CTS threaded into the periodic
|
||||
/// <see cref="LoadSiteAddressesFromDb"/> repository call so a hung MS SQL
|
||||
/// connection is bounded by actor shutdown rather than holding piped tasks
|
||||
/// open indefinitely. Cancelled in <see cref="PostStop"/>; never reset.
|
||||
/// </summary>
|
||||
private readonly CancellationTokenSource _lifecycleCts = new();
|
||||
|
||||
/// <summary>
|
||||
/// Proxy <see cref="IActorRef"/> for the central NotificationOutboxActor cluster singleton.
|
||||
/// Set via <see cref="RegisterNotificationOutbox"/> — the Host creates the singleton proxy
|
||||
/// after this actor and registers it (mirrors how the site-side actor receives its
|
||||
/// runtime <see cref="IActorRef"/>s). Null until registration completes; a notification
|
||||
/// arriving before then is rejected with a non-accepted ack so the site retries.
|
||||
/// </summary>
|
||||
private IActorRef? _notificationOutboxProxy;
|
||||
|
||||
/// <summary>
|
||||
/// Proxy <see cref="IActorRef"/> for the central AuditLogIngestActor cluster
|
||||
/// singleton. Set via <see cref="RegisterAuditIngest"/> — the Host creates the
|
||||
/// singleton proxy after this actor and registers it (mirrors
|
||||
/// <see cref="_notificationOutboxProxy"/>). Null until registration completes;
|
||||
/// an audit ingest command arriving before then is answered with an empty
|
||||
/// reply so the site keeps its rows Pending and retries.
|
||||
///
|
||||
/// Once registered, the handler Asks this proxy and pipes the reply straight
|
||||
/// back to the caller. On an Ask timeout or a faulted reply, PipeTo forwards a
|
||||
/// <see cref="Status.Failure"/> to the caller — the fault propagates rather
|
||||
/// than being swallowed. This differs from the gRPC handler
|
||||
/// (<c>SiteStreamGrpcServer</c>), which catches the exception and returns an
|
||||
/// empty ack; here the faulted Ask is the transient signal the site relies on
|
||||
/// (see <see cref="HandleIngestAuditEvents"/>).
|
||||
/// </summary>
|
||||
private IActorRef? _auditIngestProxy;
|
||||
|
||||
/// <summary>
|
||||
/// Default Ask timeout for routing audit ingest commands to the
|
||||
/// AuditLogIngestActor proxy — 30 s, matching the value of
|
||||
/// <c>SiteStreamGrpcServer.AuditIngestAskTimeout</c> (that constant is private
|
||||
/// to the gRPC server and not reachable here, so it is declared locally). A
|
||||
/// generous window absorbs a slow MS SQL connection without the round-trip
|
||||
/// surfacing as a failure on a healthy site. When the window is exceeded the
|
||||
/// Ask faults and that fault is piped back to the caller as a
|
||||
/// <see cref="Status.Failure"/> (see <see cref="HandleIngestAuditEvents"/>).
|
||||
/// </summary>
|
||||
private static readonly TimeSpan DefaultAuditIngestAskTimeout = TimeSpan.FromSeconds(30);
|
||||
|
||||
/// <summary>
|
||||
/// Effective Ask timeout for audit ingest routing. Defaults to
|
||||
/// <see cref="DefaultAuditIngestAskTimeout"/>; overridable via the constructor
|
||||
/// so tests can exercise the timeout/fault path without waiting 30 s.
|
||||
/// </summary>
|
||||
private readonly TimeSpan _auditIngestAskTimeout;
|
||||
|
||||
/// <summary>
|
||||
/// DistributedPubSub topic used to fan health reports out to the peer
|
||||
/// central node so both per-node aggregators stay in sync. See
|
||||
/// <see cref="SiteHealthReportReplica"/> for the protocol rationale.
|
||||
/// </summary>
|
||||
private const string HealthReportTopic = "site-health-replica";
|
||||
|
||||
/// <summary>Creates the <see cref="CentralCommunicationActor"/> and registers every message handler it serves.</summary>
/// <param name="serviceProvider">DI provider used to resolve the site repository (scoped) and the health aggregator.</param>
/// <param name="siteClientFactory">Factory that builds the per-site ClusterClient actors.</param>
/// <param name="auditIngestAskTimeout">
/// Optional Ask timeout for audit-ingest routing. When omitted,
/// <see cref="DefaultAuditIngestAskTimeout"/> (30 s) applies. Intended for tests that
/// need to reach the timeout/fault path quickly.
/// </param>
public CentralCommunicationActor(
    IServiceProvider serviceProvider,
    ISiteClientFactory siteClientFactory,
    TimeSpan? auditIngestAskTimeout = null)
{
    _serviceProvider = serviceProvider;
    _siteClientFactory = siteClientFactory;
    _auditIngestAskTimeout = auditIngestAskTimeout.HasValue
        ? auditIngestAskTimeout.Value
        : DefaultAuditIngestAskTimeout;

    // --- Site address cache -------------------------------------------------
    // Result of a database load, delivered back onto the actor thread.
    Receive<SiteAddressCacheLoaded>(HandleSiteAddressCacheLoaded);

    // Scheduler tick asking for a fresh load.
    Receive<RefreshSiteAddresses>(_ =>
    {
        LoadSiteAddressesFromDb();
    });

    // Communication-006: a faulted load task arrives here as Status.Failure.
    // Logging at Warning lets operators tell "database is down" apart from
    // "no sites configured"; without a handler the failure would be unhandled.
    Receive<Status.Failure>(loadFailure =>
    {
        _log.Warning(loadFailure.Cause,
            "Failed to load site addresses from the database; the site ClusterClient "
            + "cache was not refreshed and may be stale or empty");
    });

    // --- Health monitoring --------------------------------------------------
    Receive<HeartbeatMessage>(HandleHeartbeat);
    Receive<SiteHealthReport>(HandleSiteHealthReport);
    // Replicas published by the peer node are applied locally only (no re-broadcast).
    Receive<SiteHealthReportReplica>(replica => ProcessLocally(replica.Report));
    Receive<SubscribeAck>(_ =>
    {
        // DistributedPubSub subscribe confirmation — nothing to do.
    });

    // --- Central → site routing ---------------------------------------------
    Receive<SiteEnvelope>(HandleSiteEnvelope);

    // --- Notification Outbox ------------------------------------------------
    // The Host creates the outbox singleton proxy after this actor exists and
    // hands it over with this registration message.
    Receive<RegisterNotificationOutbox>(registration =>
    {
        _notificationOutboxProxy = registration.OutboxProxy;
        _log.Info("Registered notification outbox proxy");
    });

    // Site-originated submissions and status queries are forwarded to the
    // outbox proxy; Forward keeps the original Sender so replies go straight
    // back to the site.
    Receive<NotificationSubmit>(HandleNotificationSubmit);
    Receive<NotificationStatusQuery>(HandleNotificationStatusQuery);

    // --- Audit Log (#23) ----------------------------------------------------
    // Same late-registration pattern as the notification outbox proxy.
    Receive<RegisterAuditIngest>(registration =>
    {
        _auditIngestProxy = registration.AuditIngestActor;
        _log.Info("Registered audit ingest proxy");
    });

    // Site → central audit batches: Ask the ingest proxy and pipe the reply
    // to the original Sender.
    Receive<IngestAuditEventsCommand>(HandleIngestAuditEvents);

    // (#23 M3) combined-telemetry ingest goes through the same proxy.
    Receive<IngestCachedTelemetryCommand>(HandleIngestCachedTelemetry);
}
|
||||
|
||||
/// <summary>
/// Routes a site-submitted notification to the outbox proxy, or rejects it with a
/// non-accepted ack when no proxy has been registered yet.
/// </summary>
/// <param name="msg">The notification submission received from a site.</param>
private void HandleNotificationSubmit(NotificationSubmit msg)
{
    if (_notificationOutboxProxy is { } outbox)
    {
        _log.Debug("Routing NotificationSubmit {0} to the notification outbox", msg.NotificationId);
        // Forward (not Tell) so the outbox sees the original Sender.
        outbox.Forward(msg);
        return;
    }

    // Proxy not registered yet: answer with Accepted = false so the submitting
    // side can treat the attempt as transient and try again later.
    _log.Warning(
        "Cannot route NotificationSubmit {0} — notification outbox not available",
        msg.NotificationId);

    var rejection = new NotificationSubmitAck(
        msg.NotificationId, Accepted: false, Error: "notification outbox not available");
    Sender.Tell(rejection);
}
|
||||
|
||||
/// <summary>
/// Routes a notification status query to the outbox proxy, or answers
/// "not found" directly when no proxy has been registered yet.
/// </summary>
/// <param name="msg">The status query received from a site.</param>
private void HandleNotificationStatusQuery(NotificationStatusQuery msg)
{
    if (_notificationOutboxProxy is { } outbox)
    {
        _log.Debug("Routing NotificationStatusQuery {0} to the notification outbox", msg.NotificationId);
        // Forward (not Tell) so the response is addressed to the original Sender.
        outbox.Forward(msg);
        return;
    }

    // Proxy not registered yet: reply Found = false so the caller can fall
    // back to whatever local state it has.
    _log.Warning(
        "Cannot route NotificationStatusQuery {0} — notification outbox not available",
        msg.NotificationId);

    var notFound = new NotificationStatusResponse(
        msg.CorrelationId, Found: false, Status: "Unknown",
        RetryCount: 0, LastError: null, DeliveredAt: null);
    Sender.Tell(notFound);
}
|
||||
|
||||
/// <summary>
/// Relays a batch of site audit events to the audit ingest proxy and pipes the
/// proxy's reply (or fault) back to the sender.
/// </summary>
/// <param name="msg">The audit ingest command received from a site.</param>
private void HandleIngestAuditEvents(IngestAuditEventsCommand msg)
{
    var ingest = _auditIngestProxy;
    if (ingest is null)
    {
        // Host startup race: the proxy is not registered yet. An empty reply
        // acknowledges nothing, so the sender has no ids to mark as ingested.
        _log.Warning(
            "Cannot route IngestAuditEventsCommand ({0} events) — audit ingest not available",
            msg.Events.Count);
        Sender.Tell(new IngestAuditEventsReply(Array.Empty<Guid>()));
        return;
    }

    // Sender must be captured now: the Ask completes after this handler has
    // returned, when Sender no longer refers to this message's sender.
    var replyTo = Sender;
    _log.Debug("Routing IngestAuditEventsCommand ({0} events) to the audit ingest actor", msg.Events.Count);

    // A timeout or faulted reply is not swallowed: PipeTo hands replyTo a
    // Status.Failure, so the caller observes the fault.
    var pendingReply = ingest.Ask<IngestAuditEventsReply>(msg, _auditIngestAskTimeout);
    pendingReply.PipeTo(replyTo);
}
|
||||
|
||||
/// <summary>
/// Relays a cached-telemetry ingest command to the audit ingest proxy and pipes
/// the proxy's reply (or fault) back to the sender.
/// </summary>
/// <param name="msg">The cached-telemetry ingest command received from a site.</param>
private void HandleIngestCachedTelemetry(IngestCachedTelemetryCommand msg)
{
    var ingest = _auditIngestProxy;
    if (ingest is null)
    {
        // Proxy not registered yet: reply with an empty id list.
        _log.Warning(
            "Cannot route IngestCachedTelemetryCommand ({0} entries) — audit ingest not available",
            msg.Entries.Count);
        Sender.Tell(new IngestCachedTelemetryReply(Array.Empty<Guid>()));
        return;
    }

    // Capture Sender before going asynchronous; it is only valid during this dispatch.
    var replyTo = Sender;
    _log.Debug("Routing IngestCachedTelemetryCommand ({0} entries) to the audit ingest actor", msg.Entries.Count);

    var pendingReply = ingest.Ask<IngestCachedTelemetryReply>(msg, _auditIngestAskTimeout);
    pendingReply.PipeTo(replyTo);
}
|
||||
|
||||
/// <summary>
/// Records a site heartbeat on the health aggregator, if one is registered.
/// A missing aggregator is ignored without logging.
/// </summary>
/// <param name="heartbeat">Heartbeat carrying the site id and its timestamp.</param>
private void HandleHeartbeat(HeartbeatMessage heartbeat)
{
    if (_serviceProvider.GetService<ICentralHealthAggregator>() is { } healthAggregator)
    {
        healthAggregator.MarkHeartbeat(heartbeat.SiteId, heartbeat.Timestamp);
    }
}
|
||||
|
||||
/// <summary>
/// Handles a health report received directly from a site: applies it to the
/// local aggregator, then publishes a <see cref="SiteHealthReportReplica"/> on
/// <see cref="HealthReportTopic"/> so the peer central node can apply it too.
/// </summary>
/// <param name="report">The health report sent by the site.</param>
private void HandleSiteHealthReport(SiteHealthReport report)
{
    ProcessLocally(report);

    try
    {
        DistributedPubSub.Get(Context.System).Mediator.Tell(
            new Publish(HealthReportTopic, new SiteHealthReportReplica(report)));
    }
    catch (Exception ex)
    {
        // Replication is best-effort: hosts without clustering (e.g. TestKit)
        // cannot provide DistributedPubSub. The report has already been applied
        // locally, so the failure is only traced (Debug, matching PreStart)
        // instead of being swallowed without any record.
        _log.Debug(
            "DistributedPubSub not available — health report from site {0} not replicated to peer: {1}",
            report.SiteId, ex.Message);
    }
}
|
||||
|
||||
/// <summary>
/// Hands a health report to the local aggregator without publishing it to the
/// peer. Called for reports from sites and for replicas received from the peer.
/// When no aggregator is registered the report is dropped with a warning.
/// </summary>
/// <param name="report">The health report to apply.</param>
private void ProcessLocally(SiteHealthReport report)
{
    var healthAggregator = _serviceProvider.GetService<ICentralHealthAggregator>();
    if (healthAggregator is null)
    {
        _log.Warning("ICentralHealthAggregator not available, dropping health report from site {0}", report.SiteId);
        return;
    }

    healthAggregator.ProcessReport(report);
}
|
||||
|
||||
// Communication-016: HandleConnectionStateChanged removed — no production
|
||||
// caller emitted ConnectionStateChanged, so the workflow ran only in tests.
|
||||
// Disconnect detection is owned by the transport layers (gRPC keepalive +
|
||||
// ClusterClient/Ask timeout).
|
||||
|
||||
/// <summary>
/// Sends the envelope's payload to the target site through that site's
/// ClusterClient. If no client is cached for the site the message is dropped
/// with a warning and no reply is sent.
/// </summary>
/// <param name="envelope">Envelope naming the target site and carrying the payload.</param>
private void HandleSiteEnvelope(SiteEnvelope envelope)
{
    if (_siteClients.TryGetValue(envelope.SiteId, out var route))
    {
        // Pass the current Sender along so the site's reply is addressed to
        // the original asker rather than to this actor.
        var outbound = new ClusterClient.Send("/user/site-communication", envelope.Message);
        route.Client.Tell(outbound, Sender);
        return;
    }

    // WP-5: nothing is buffered centrally. With no reply sent, a caller
    // using Ask simply runs into its own timeout.
    _log.Warning("No ClusterClient for site {0}, cannot route message {1}",
        envelope.SiteId, envelope.Message.GetType().Name);
}
|
||||
|
||||
/// <summary>
/// Loads all sites from the repository on a background task, builds the
/// per-site contact address lists and pipes the result to this actor as a
/// <see cref="SiteAddressCacheLoaded"/>. A faulted task is piped as a
/// <see cref="Status.Failure"/>.
/// </summary>
private void LoadSiteAddressesFromDb()
{
    // Self is captured here because the continuation runs off the actor thread.
    var self = Self;

    // Communication-019: the lifecycle token bounds the repository call by
    // actor shutdown. Reading Token throws once PostStop has disposed the
    // source; a late tick hitting that is treated as "actor gone, give up".
    CancellationToken ct;
    try
    {
        ct = _lifecycleCts.Token;
    }
    catch (ObjectDisposedException)
    {
        return;
    }

    Task.Run(async () =>
    {
        using var scope = _serviceProvider.CreateScope();
        var repo = scope.ServiceProvider.GetRequiredService<ISiteRepository>();
        var sites = await repo.GetAllSitesAsync(ct).ConfigureAwait(false);

        // Communication-020: the payload is built from read-only list views and
        // is handed over as IReadOnlyDictionary, so the message is immutable by type.
        var contacts = new Dictionary<string, IReadOnlyList<string>>();
        foreach (var site in sites)
        {
            var addrs = new List<string>(2);
            AddContactAddress(addrs, site.NodeAAddress);
            AddContactAddress(addrs, site.NodeBAddress);

            // Sites without any usable address are left out of the cache.
            if (addrs.Count > 0)
            {
                contacts[site.SiteIdentifier] = addrs.AsReadOnly();
            }
        }

        return new SiteAddressCacheLoaded(contacts);
    }).PipeTo(self);
}

/// <summary>
/// Appends a node address to <paramref name="target"/> after removing a legacy
/// "/user/..." actor-path suffix. Null, empty or whitespace addresses are skipped.
/// </summary>
/// <param name="target">List receiving the normalized address.</param>
/// <param name="address">Raw node address as stored for the site; may be null.</param>
private static void AddContactAddress(List<string> target, string? address)
{
    if (string.IsNullOrWhiteSpace(address))
    {
        return;
    }

    // Ordinal search: the address is a machine identifier, so culture-sensitive
    // matching rules must not influence where the suffix is found.
    var suffixAt = address.IndexOf("/user/", StringComparison.Ordinal);
    target.Add(suffixAt > 0 ? address.Substring(0, suffixAt) : address);
}
|
||||
|
||||
/// <summary>
/// Reconciles the per-site ClusterClient cache with freshly loaded contact
/// data: stops clients of sites that disappeared, leaves unchanged sites
/// alone, and replaces the client of any site whose addresses changed.
/// A failure affecting one site never aborts the refresh for the others.
/// </summary>
/// <param name="msg">Site id → contact address list, as loaded from the database.</param>
private void HandleSiteAddressCacheLoaded(SiteAddressCacheLoaded msg)
{
    // Snapshot the ids to drop first; the dictionary is mutated in the loop.
    var removedSiteIds = _siteClients.Keys
        .Where(id => !msg.SiteContacts.ContainsKey(id))
        .ToList();

    foreach (var removed in removedSiteIds)
    {
        _log.Info("Stopping ClusterClient for removed site {0}", removed);
        if (_siteClients.Remove(removed, out var stale))
        {
            Context.Stop(stale.Client);
        }
    }

    // Add or update
    foreach (var (siteId, addresses) in msg.SiteContacts)
    {
        // Communication-009: parse every address up front so one malformed
        // site row is logged and skipped instead of aborting the whole loop.
        ImmutableHashSet<ActorPath> contactPaths;
        try
        {
            contactPaths = addresses
                .Select(a => ActorPath.Parse($"{a}/system/receptionist"))
                .ToImmutableHashSet();
        }
        catch (Exception ex)
        {
            _log.Warning(ex,
                "Malformed contact address for site {0}; skipping this site in the refresh "
                + "(other sites are unaffected)", siteId);
            continue;
        }

        var contactStrings = addresses.ToImmutableHashSet();

        // Single lookup; the entry is reused for both the comparison and the stop.
        var hasExisting = _siteClients.TryGetValue(siteId, out var existing);
        if (hasExisting && existing.ContactAddresses.SetEquals(contactStrings))
        {
            continue; // unchanged
        }

        if (hasExisting)
        {
            _log.Info("Updating ClusterClient for site {0} (addresses changed)", siteId);
            Context.Stop(existing.Client);
            // Drop the entry right away so messages are never routed to a
            // client that has been told to stop, even if re-creation fails.
            _siteClients.Remove(siteId);
        }

        // Creation can fail. In particular the default factory names the actor
        // "site-client-{siteId}", and Akka keeps a name reserved until the
        // stopped actor has fully terminated, so an immediate re-create may be
        // rejected. Log and skip: the site stays without a route until a later
        // refresh succeeds, and the remaining sites are still processed.
        IActorRef client;
        try
        {
            client = _siteClientFactory.Create(Context.System, siteId, contactPaths);
        }
        catch (Exception ex)
        {
            _log.Warning(ex,
                "Failed to create ClusterClient for site {0}; the site has no route until a "
                + "later refresh succeeds (other sites are unaffected)", siteId);
            continue;
        }

        _siteClients[siteId] = (client, contactStrings);
        _log.Info("Created ClusterClient for site {0} with {1} contact(s)", siteId, addresses.Count);
    }

    _log.Info("Site ClusterClient cache refreshed with {0} site(s)", _siteClients.Count);
}
|
||||
|
||||
// Communication-016: TrackMessageForCleanup removed — the dicts it fed
|
||||
// existed solely to support the dead ConnectionStateChanged workflow.
|
||||
|
||||
/// <summary>
/// Supervision policy for this actor's children: every failure is logged at
/// Warning and the child is resumed, with no retry limit and no time window.
/// </summary>
/// <returns>A one-for-one strategy that always resumes.</returns>
protected override SupervisorStrategy SupervisorStrategy()
{
    Func<Exception, Directive> resumeAfterLogging = failure =>
    {
        _log.Warning(failure, "Child actor of CentralCommunicationActor faulted, resuming (state preserved)");
        return Directive.Resume;
    };

    return new OneForOneStrategy(
        maxNrOfRetries: -1,
        withinTimeRange: Timeout.InfiniteTimeSpan,
        decider: Decider.From(resumeAfterLogging));
}
|
||||
|
||||
/// <summary>
/// Startup hook: subscribes to the peer health-replication topic (best effort)
/// and starts the repeating site-address refresh.
/// </summary>
protected override void PreStart()
{
    _log.Info("CentralCommunicationActor started");

    // Subscribing lets this node receive health reports that were delivered
    // to the other central node. Hosts without clustering (TestKit) cannot
    // provide the extension; that is tolerated and only traced.
    try
    {
        var mediator = DistributedPubSub.Get(Context.System).Mediator;
        mediator.Tell(new Subscribe(HealthReportTopic, Self));
    }
    catch (Exception ex)
    {
        _log.Debug("DistributedPubSub not available — peer health replication disabled: {0}", ex.Message);
    }

    // First refresh fires immediately, then once every 60 seconds.
    var firstTickDelay = TimeSpan.Zero;
    var refreshInterval = TimeSpan.FromSeconds(60);
    _refreshSchedule = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
        firstTickDelay,
        refreshInterval,
        Self,
        new RefreshSiteAddresses(),
        ActorRefs.NoSender);
}
|
||||
|
||||
/// <summary>
/// Shutdown hook: cancels the refresh schedule, then cancels and disposes the
/// lifecycle token source so an in-flight site-address load is cancelled.
/// </summary>
protected override void PostStop()
{
    _log.Info("CentralCommunicationActor stopped");

    if (_refreshSchedule is not null)
    {
        _refreshSchedule.Cancel();
    }

    // Communication-019: signal cancellation to any LoadSiteAddressesFromDb
    // call still running so a hung query does not outlive the actor.
    try
    {
        _lifecycleCts.Cancel();
    }
    catch (ObjectDisposedException)
    {
        // Already disposed by an earlier stop — nothing left to cancel.
    }

    _lifecycleCts.Dispose();
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Tick message asking <see cref="CentralCommunicationActor"/> to reload the
/// site addresses from the database.
/// </summary>
public record RefreshSiteAddresses;
|
||||
|
||||
/// <summary>
/// Internal message carrying the site contact data loaded from the database.
/// The ClusterClients themselves are created on the actor thread when this
/// message is handled, not on the loading task.
///
/// Communication-020: the payload is typed as
/// <see cref="IReadOnlyDictionary{TKey,TValue}"/> of <see cref="IReadOnlyList{T}"/>
/// so the Akka.NET "messages are immutable" convention is expressed by the type
/// rather than left to producer discipline. The producer wraps each address
/// list with <see cref="List{T}.AsReadOnly"/> before piping the message to Self.
/// </summary>
/// <param name="SiteContacts">Site identifier → contact addresses for that site.</param>
internal record SiteAddressCacheLoaded(IReadOnlyDictionary<string, IReadOnlyList<string>> SiteContacts);
|
||||
|
||||
/// <summary>
/// Tells a debug view subscriber that its stream was terminated because the
/// site disconnected (WP-5).
/// </summary>
/// <param name="SiteId">Identifier of the site whose stream ended.</param>
/// <param name="CorrelationId">Correlation id of the affected debug session.</param>
public record DebugStreamTerminated(string SiteId, string CorrelationId);
|
||||
|
||||
/// <summary>
/// Hands the NotificationOutboxActor singleton proxy to the
/// <see cref="CentralCommunicationActor"/>, which then routes site-forwarded
/// <see cref="NotificationSubmit"/> and <see cref="NotificationStatusQuery"/>
/// messages to it. The Host sends this once the proxy has been created.
/// </summary>
/// <param name="OutboxProxy">Reference to the notification outbox singleton proxy.</param>
public record RegisterNotificationOutbox(IActorRef OutboxProxy);
|
||||
|
||||
/// <summary>
/// Hands the AuditLogIngestActor singleton proxy to the
/// <see cref="CentralCommunicationActor"/>, which then routes site-forwarded
/// <see cref="IngestAuditEventsCommand"/> and
/// <see cref="IngestCachedTelemetryCommand"/> messages to it. The Host sends
/// this once the audit-ingest proxy has been created. The record is declared
/// here rather than in <c>ZB.MOM.WW.ScadaBridge.Commons</c> because that project
/// has no Akka package reference and therefore cannot hold an
/// <see cref="IActorRef"/>.
/// </summary>
/// <param name="AuditIngestActor">Reference to the audit ingest singleton proxy.</param>
public sealed record RegisterAuditIngest(IActorRef AuditIngestActor);
|
||||
@@ -0,0 +1,291 @@
|
||||
using Akka.Actor;
|
||||
using Akka.Event;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.DebugView;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Streaming;
|
||||
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Communication.Actors;
|
||||
|
||||
/// <summary>
|
||||
/// Long-lived (one per active debug session) actor on the central side. Debug sessions
|
||||
/// are session-based and temporary — this actor holds no persisted state and does not
|
||||
/// derive from an Akka.Persistence base class; its state does not survive a restart.
|
||||
/// Sends SubscribeDebugViewRequest to the site via CentralCommunicationActor (with THIS actor
|
||||
/// as the Sender) to get the initial snapshot. After receiving the snapshot, opens a gRPC
|
||||
/// server-streaming subscription via SiteStreamGrpcClient for ongoing events.
|
||||
/// Stream events are marshalled back to the actor via Self.Tell for thread safety.
|
||||
/// </summary>
|
||||
public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
|
||||
{
|
||||
private readonly ILoggingAdapter _log = Context.GetLogger();
|
||||
private readonly string _siteIdentifier;
|
||||
private readonly string _instanceUniqueName;
|
||||
private readonly string _correlationId;
|
||||
private readonly IActorRef _centralCommunicationActor;
|
||||
private readonly Action<object> _onEvent;
|
||||
private readonly Action _onTerminated;
|
||||
private readonly SiteStreamGrpcClientFactory _grpcFactory;
|
||||
private readonly string _grpcNodeAAddress;
|
||||
private readonly string _grpcNodeBAddress;
|
||||
|
||||
private const int MaxRetries = 3;
|
||||
private const string ReconnectTimerKey = "grpc-reconnect";
|
||||
private const string StabilityTimerKey = "grpc-stability";
|
||||
/// <summary>Delay between gRPC reconnection attempts.</summary>
|
||||
internal static TimeSpan ReconnectDelay { get; set; } = TimeSpan.FromSeconds(5);
|
||||
|
||||
/// <summary>
|
||||
/// How long a freshly-opened gRPC stream must stay up before its retry budget
|
||||
/// is considered "recovered" and <see cref="_retryCount"/> is reset to 0.
|
||||
/// Communication-008: the retry count must NOT be reset by individual events —
|
||||
/// a stream that connects, delivers one event, then fails repeatedly would
|
||||
/// otherwise reconnect forever and never trip <see cref="MaxRetries"/>. Resetting
|
||||
/// only after a stable interval bounds a flapping stream.
|
||||
/// </summary>
|
||||
internal static TimeSpan StabilityWindow { get; set; } = TimeSpan.FromSeconds(60);
|
||||
|
||||
private int _retryCount;
|
||||
private bool _useNodeA = true;
|
||||
private bool _stopped;
|
||||
private CancellationTokenSource? _grpcCts;
|
||||
|
||||
/// <summary>Timer scheduler for reconnect and stability window timers.</summary>
|
||||
public ITimerScheduler Timers { get; set; } = null!;
|
||||
|
||||
/// <summary>
/// Creates the bridge for one instance's debug stream and registers every message handler.
/// </summary>
/// <param name="siteIdentifier">Site identifier used to address ClusterClient envelopes and in log output.</param>
/// <param name="instanceUniqueName">Unique name of the instance whose debug stream is bridged.</param>
/// <param name="correlationId">Correlation id of the debug session.</param>
/// <param name="centralCommunicationActor">Actor that relays <c>SiteEnvelope</c> messages to the site.</param>
/// <param name="onEvent">Callback invoked with the initial snapshot and with every subsequent debug event.</param>
/// <param name="onTerminated">Callback invoked when the bridge gives up on the stream.</param>
/// <param name="grpcFactory">Factory that supplies gRPC streaming clients.</param>
/// <param name="grpcNodeAAddress">gRPC address of the site's node A.</param>
/// <param name="grpcNodeBAddress">gRPC address of the site's node B.</param>
public DebugStreamBridgeActor(
    string siteIdentifier,
    string instanceUniqueName,
    string correlationId,
    IActorRef centralCommunicationActor,
    Action<object> onEvent,
    Action onTerminated,
    SiteStreamGrpcClientFactory grpcFactory,
    string grpcNodeAAddress,
    string grpcNodeBAddress)
{
    _siteIdentifier = siteIdentifier;
    _instanceUniqueName = instanceUniqueName;
    _correlationId = correlationId;
    _centralCommunicationActor = centralCommunicationActor;
    _onEvent = onEvent;
    _onTerminated = onTerminated;
    _grpcFactory = grpcFactory;
    _grpcNodeAAddress = grpcNodeAAddress;
    _grpcNodeBAddress = grpcNodeBAddress;

    // Common shutdown sequence shared by the stop paths below. The order is fixed:
    // release the gRPC side, optionally tell the site to unsubscribe, mark the actor
    // stopped, optionally notify the consumer, then stop the actor.
    void Shutdown(bool unsubscribeFromSite, bool notifyConsumer)
    {
        CleanupGrpc();

        if (unsubscribeFromSite)
        {
            SendUnsubscribe();
        }

        _stopped = true;

        if (notifyConsumer)
        {
            _onTerminated();
        }

        Context.Stop(Self);
    }

    // The site's reply to the subscribe request sent in PreStart: hand the snapshot
    // to the consumer first, then start the live gRPC stream.
    Receive<DebugViewSnapshot>(initial =>
    {
        _log.Info("Received initial snapshot for {0} ({1} attrs, {2} alarms)",
            _instanceUniqueName, initial.AttributeValues.Count, initial.AlarmStates.Count);
        _onEvent(initial);
        OpenGrpcStream();
    });

    // Live events, posted to Self by the gRPC callback. Communication-008: an
    // incoming event deliberately leaves _retryCount alone. If it reset the count,
    // a flapping stream that squeezes out one event between failures could never
    // reach MaxRetries. Only GrpcStreamStable restores the retry budget.
    Receive<AttributeValueChanged>(attributeEvent =>
    {
        _onEvent(attributeEvent);
    });
    Receive<AlarmStateChanged>(alarmEvent =>
    {
        _onEvent(alarmEvent);
    });

    // The stream survived a full StabilityWindow: give the next transient fault a
    // fresh retry budget.
    Receive<GrpcStreamStable>(_ =>
    {
        if (!_stopped)
        {
            _retryCount = 0;
            _log.Debug("gRPC stream for {0} stable, retry count reset", _instanceUniqueName);
        }
    });

    // A stream failure reported by the gRPC callback: log it and run the reconnect policy.
    Receive<GrpcStreamError>(failure =>
    {
        _log.Warning("gRPC stream error for {0}: {1}", _instanceUniqueName, failure.Exception.Message);
        HandleGrpcError();
    });

    // Reconnect request, either sent immediately or delivered by the reconnect timer.
    Receive<ReconnectGrpcStream>(_ =>
    {
        OpenGrpcStream();
    });

    // The consumer asked to stop: tear down and unsubscribe at the site. The
    // termination callback is not invoked on this path.
    Receive<StopDebugStream>(_ =>
    {
        _log.Info("Stopping debug stream for {0}", _instanceUniqueName);
        Shutdown(unsubscribeFromSite: true, notifyConsumer: false);
    });

    // CentralCommunicationActor reports that the site disconnected. No unsubscribe
    // is sent on this path.
    Receive<DebugStreamTerminated>(terminated =>
    {
        // Idempotent: a gRPC error may arrive at the same time.
        if (_stopped)
        {
            return;
        }

        _log.Warning("Debug stream terminated for {0} (site {1} disconnected)", _instanceUniqueName, terminated.SiteId);
        Shutdown(unsubscribeFromSite: false, notifyConsumer: true);
    });

    // Orphan safety net. Akka delivers ReceiveTimeout once the actor has gone
    // 5 minutes without receiving any message; that idle period is treated as an
    // orphaned session and the bridge shuts itself down.
    Context.SetReceiveTimeout(TimeSpan.FromMinutes(5));
    Receive<ReceiveTimeout>(_ =>
    {
        _log.Warning("Debug stream for {0} timed out (orphaned session), stopping", _instanceUniqueName);
        Shutdown(unsubscribeFromSite: true, notifyConsumer: true);
    });
}
|
||||
|
||||
/// <inheritdoc />
protected override void PreStart()
{
    _log.Info("Starting debug stream bridge for {0} on site {1}", _instanceUniqueName, _siteIdentifier);

    // Request the initial snapshot from the site. The request travels inside a
    // SiteEnvelope via CentralCommunicationActor, with this actor as the sender
    // so the reply comes back here.
    var subscribeRequest = new SubscribeDebugViewRequest(_instanceUniqueName, _correlationId);
    _centralCommunicationActor.Tell(new SiteEnvelope(_siteIdentifier, subscribeRequest), Self);
}
|
||||
|
||||
/// <inheritdoc />
protected override void PostStop()
{
    // Cancel and release the token source of any stream still open, whichever
    // path stopped the actor.
    if (_grpcCts is { } activeCts)
    {
        activeCts.Cancel();
        activeCts.Dispose();
    }

    _grpcCts = null;
    base.PostStop();
}
|
||||
|
||||
/// <summary>
/// Opens (or re-opens) the gRPC event stream against the currently selected node.
/// Does nothing once the actor has been marked stopped.
/// </summary>
private void OpenGrpcStream()
{
    if (_stopped)
    {
        return;
    }

    string endpoint;
    if (_useNodeA)
    {
        endpoint = _grpcNodeAAddress;
    }
    else
    {
        endpoint = _grpcNodeBAddress;
    }

    _log.Info("Opening gRPC stream for {0} to {1}", _instanceUniqueName, endpoint);

    // Retire the token source of any previous stream before creating a new one.
    if (_grpcCts is { } staleCts)
    {
        staleCts.Cancel();
        staleCts.Dispose();
    }

    _grpcCts = new CancellationTokenSource();

    // Communication-008: arm the stability timer. If the stream is still up after
    // StabilityWindow the retry budget is recovered; HandleGrpcError cancels this
    // timer when the stream fails first.
    Timers.StartSingleTimer(StabilityTimerKey, new GrpcStreamStable(), StabilityWindow);

    var streamClient = _grpcFactory.GetOrCreate(_siteIdentifier, endpoint);

    // Capture Self and the token as locals: the callbacks below run off the actor
    // thread, so they reach the actor only through Tell.
    var bridge = Self;
    var streamToken = _grpcCts.Token;

    // NOTE(review): the task is not observed. An exception that escapes
    // SubscribeAsync instead of going through the error callback would be lost.
    // Confirm SubscribeAsync reports every failure via the callback.
    _ = Task.Run(async () =>
    {
        await client_SubscribeAsync();
    }, streamToken);

    Task client_SubscribeAsync() => streamClient.SubscribeAsync(
        _correlationId,
        _instanceUniqueName,
        evt => bridge.Tell(evt),
        ex => bridge.Tell(new GrpcStreamError(ex)),
        streamToken);
}
|
||||
|
||||
/// <summary>
/// Applies the reconnect policy after a gRPC stream failure: counts the failure,
/// gives up once <c>MaxRetries</c> is exceeded, otherwise unsubscribes the failed
/// stream, switches to the other node and schedules a reconnect.
/// </summary>
private void HandleGrpcError()
{
    if (_stopped)
    {
        return;
    }

    // Communication-008: the stream failed before its stability window elapsed,
    // so the retry budget must not be recovered.
    Timers.Cancel(StabilityTimerKey);

    _retryCount += 1;

    if (_retryCount > MaxRetries)
    {
        // NOTE(review): unlike the StopDebugStream and ReceiveTimeout paths, this
        // path sends no UnsubscribeDebugViewRequest to the site. Confirm that is
        // intentional.
        _log.Error("gRPC stream for {0} exceeded max retries ({1}), terminating", _instanceUniqueName, MaxRetries);
        CleanupGrpc();
        _stopped = true;
        _onTerminated();
        Context.Stop(Self);
        return;
    }

    // Unsubscribe the failed stream on the endpoint it was using, before the node
    // flip below. This cancels the local subscription and, where the channel is
    // still alive, propagates gRPC cancellation to the site so its
    // SiteStreamGrpcServer stops the StreamRelayActor for this correlation id,
    // rather than leaving a zombie relay actor until TCP RST / keepalive
    // eventually detects the loss.
    var failedEndpoint = _useNodeA ? _grpcNodeAAddress : _grpcNodeBAddress;
    _grpcFactory.GetOrCreate(_siteIdentifier, failedEndpoint).Unsubscribe(_correlationId);

    // Try the other node next.
    _useNodeA = !_useNodeA;

    // Later retries wait for ReconnectDelay; only the first retry is immediate.
    if (_retryCount != 1)
    {
        Timers.StartSingleTimer(ReconnectTimerKey, new ReconnectGrpcStream(), ReconnectDelay);
        return;
    }

    Self.Tell(new ReconnectGrpcStream());
}
|
||||
|
||||
/// <summary>
/// Releases the local gRPC stream state: cancels and disposes the current token
/// source, then unsubscribes this correlation id on the currently selected node.
/// </summary>
private void CleanupGrpc()
{
    if (_grpcCts is { } activeCts)
    {
        activeCts.Cancel();
        activeCts.Dispose();
    }

    _grpcCts = null;

    var currentEndpoint = _useNodeA ? _grpcNodeAAddress : _grpcNodeBAddress;
    _grpcFactory.GetOrCreate(_siteIdentifier, currentEndpoint).Unsubscribe(_correlationId);
}
|
||||
|
||||
/// <summary>
/// Sends an <c>UnsubscribeDebugViewRequest</c> for this session to the site,
/// wrapped in a <c>SiteEnvelope</c> and relayed by the central communication actor.
/// </summary>
private void SendUnsubscribe()
{
    var unsubscribeRequest = new UnsubscribeDebugViewRequest(_instanceUniqueName, _correlationId);
    _centralCommunicationActor.Tell(new SiteEnvelope(_siteIdentifier, unsubscribeRequest), Self);
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Tells a <see cref="DebugStreamBridgeActor"/> to end its debug stream session.
/// </summary>
public record StopDebugStream;

/// <summary>
/// Internal message reporting that the gRPC stream failed.
/// </summary>
/// <param name="Exception">The failure reported for the stream.</param>
internal record GrpcStreamError(Exception Exception);

/// <summary>
/// Internal message asking the bridge to (re)open its gRPC stream.
/// </summary>
internal record ReconnectGrpcStream;

/// <summary>
/// Internal message signalling that the current gRPC stream has stayed connected for
/// <see cref="DebugStreamBridgeActor.StabilityWindow"/> and therefore counts as stable,
/// which lets the reconnect retry budget be recovered.
/// </summary>
internal record GrpcStreamStable;
|
||||
@@ -0,0 +1,457 @@
|
||||
using Akka.Actor;
|
||||
using Akka.Cluster;
|
||||
using Akka.Cluster.Tools.Client;
|
||||
using Akka.Event;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Artifacts;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.DebugView;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.InboundApi;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Lifecycle;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Notification;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.RemoteQuery;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Communication.Actors;
|
||||
|
||||
/// <summary>
|
||||
/// Site-side actor that receives messages from central via ClusterClient and routes
|
||||
/// them to the appropriate local actors. Also sends heartbeats and health reports
|
||||
/// to central via the registered ClusterClient.
|
||||
///
|
||||
/// WP-4: Routes all 8 message patterns to local handlers.
|
||||
/// </summary>
|
||||
public class SiteCommunicationActor : ReceiveActor, IWithTimers
|
||||
{
|
||||
private readonly ILoggingAdapter _log = Context.GetLogger();
|
||||
private readonly string _siteId;
|
||||
private readonly CommunicationOptions _options;
|
||||
|
||||
/// <summary>
|
||||
/// Communication-018: predicate that returns <c>true</c> when this node is
|
||||
/// the active member of the local site cluster (used to stamp
|
||||
/// <see cref="HeartbeatMessage.IsActive"/>). Production builds default to
|
||||
/// the Akka <see cref="Cluster"/> leader check; tests inject a stub so they
|
||||
/// do not need a real cluster.
|
||||
/// </summary>
|
||||
private readonly Func<bool> _isActiveCheck;
|
||||
|
||||
/// <summary>
|
||||
/// Reference to the local Deployment Manager singleton proxy.
|
||||
/// </summary>
|
||||
private readonly IActorRef _deploymentManagerProxy;
|
||||
|
||||
/// <summary>
|
||||
/// ClusterClient reference for sending messages to the central cluster.
|
||||
/// Set via RegisterCentralClient message.
|
||||
/// </summary>
|
||||
private IActorRef? _centralClient;
|
||||
|
||||
/// <summary>
|
||||
/// Local actor references for routing specific message patterns.
|
||||
/// Populated via registration messages.
|
||||
/// </summary>
|
||||
private IActorRef? _eventLogHandler;
|
||||
private IActorRef? _parkedMessageHandler;
|
||||
private IActorRef? _integrationHandler;
|
||||
private IActorRef? _artifactHandler;
|
||||
|
||||
/// <summary>Akka timer scheduler injected by the framework via <see cref="IWithTimers"/>.</summary>
|
||||
public ITimerScheduler Timers { get; set; } = null!;
|
||||
|
||||
/// <summary>
/// Builds the site communication actor and registers one handler per supported message type.
/// The periodic heartbeat itself is started in <see cref="PreStart"/>.
/// </summary>
/// <param name="siteId">Site identifier stamped on heartbeats and on the failure replies built here.</param>
/// <param name="options">Communication options; <c>TransportHeartbeatInterval</c> sets the heartbeat period.</param>
/// <param name="deploymentManagerProxy">Local reference to the Deployment Manager singleton proxy.</param>
/// <param name="isActiveCheck">
/// Communication-018: optional override that returns <c>true</c> when this node is the
/// active member of the site cluster. When <c>null</c>, the real Akka <see cref="Cluster"/>
/// leader check is used (the production default); tests supply a stub so the
/// <c>TestKit</c> ActorSystem does not need Akka.Cluster loaded.
/// </param>
public SiteCommunicationActor(
    string siteId,
    CommunicationOptions options,
    IActorRef deploymentManagerProxy,
    Func<bool>? isActiveCheck = null)
{
    _siteId = siteId;
    _options = options;
    _deploymentManagerProxy = deploymentManagerProxy;
    _isActiveCheck = isActiveCheck ?? DefaultIsActiveCheck;

    // Path of the central actor that receives everything sent through the ClusterClient.
    const string centralPath = "/user/central-communication";

    // Hands a request to the Deployment Manager proxy, keeping the original sender.
    void ToDeploymentManager(object request)
    {
        _deploymentManagerProxy.Forward(request);
    }

    // Forwards to an optional local handler, or answers the sender with a failure
    // reply (built only when needed) if no handler has been registered.
    void ForwardOrReply(IActorRef? handler, object request, Func<object> buildFailure)
    {
        if (handler is not null)
        {
            handler.Forward(request);
        }
        else
        {
            Sender.Tell(buildFailure());
        }
    }

    // Registration
    Receive<RegisterCentralClient>(registration =>
    {
        _centralClient = registration.Client;
        _log.Info("Registered central ClusterClient");
    });
    Receive<RegisterLocalHandler>(HandleRegisterLocalHandler);

    // Pattern 1: Instance Deployment — forward to Deployment Manager
    Receive<DeployInstanceCommand>(deploy =>
    {
        _log.Debug("Routing DeployInstanceCommand for {0} to DeploymentManager", deploy.InstanceUniqueName);
        ToDeploymentManager(deploy);
    });

    // Pattern 2: Lifecycle — forward to Deployment Manager
    Receive<DisableInstanceCommand>(cmd => ToDeploymentManager(cmd));
    Receive<EnableInstanceCommand>(cmd => ToDeploymentManager(cmd));
    Receive<DeleteInstanceCommand>(cmd => ToDeploymentManager(cmd));

    // DeploymentManager-006: query-the-site-before-redeploy. The Deployment
    // Manager owns the deployed-config store and answers with the instance's
    // currently-applied deployment identity.
    Receive<DeploymentStateQueryRequest>(query => ToDeploymentManager(query));

    // Pattern 3: Artifact Deployment — forward to the artifact handler if one is registered
    Receive<DeployArtifactsCommand>(deploy =>
    {
        if (_artifactHandler is { } artifactHandler)
        {
            artifactHandler.Forward(deploy);
            return;
        }

        _log.Warning("No artifact handler registered, replying with failure");
        Sender.Tell(new ArtifactDeploymentResponse(
            deploy.DeploymentId, _siteId, false, "Artifact handler not available", DateTimeOffset.UtcNow));
    });

    // Pattern 4: Integration Routing — forward to the integration handler
    Receive<IntegrationCallRequest>(call => ForwardOrReply(
        _integrationHandler,
        call,
        () => new IntegrationCallResponse(
            call.CorrelationId, _siteId, false, null, "Integration handler not available", DateTimeOffset.UtcNow)));

    // Pattern 5: Debug View — forward to Deployment Manager (which routes to the Instance Actor)
    Receive<SubscribeDebugViewRequest>(request => ToDeploymentManager(request));
    Receive<UnsubscribeDebugViewRequest>(request => ToDeploymentManager(request));

    // Pattern 6a: Debug Snapshot (one-shot) — forward to Deployment Manager
    Receive<DebugSnapshotRequest>(request => ToDeploymentManager(request));

    // Inbound API Route.To() — forward to Deployment Manager for instance routing
    Receive<RouteToCallRequest>(request => ToDeploymentManager(request));
    Receive<RouteToGetAttributesRequest>(request => ToDeploymentManager(request));
    Receive<RouteToSetAttributesRequest>(request => ToDeploymentManager(request));

    // Pattern 7: Remote Queries
    Receive<EventLogQueryRequest>(query => ForwardOrReply(
        _eventLogHandler,
        query,
        () => new EventLogQueryResponse(
            query.CorrelationId, _siteId, [], null, false, false,
            "Event log handler not available", DateTimeOffset.UtcNow)));

    Receive<ParkedMessageQueryRequest>(query => ForwardOrReply(
        _parkedMessageHandler,
        query,
        () => new ParkedMessageQueryResponse(
            query.CorrelationId, _siteId, [], 0, query.PageNumber, query.PageSize, false,
            "Parked message handler not available", DateTimeOffset.UtcNow)));

    Receive<ParkedMessageRetryRequest>(retry => ForwardOrReply(
        _parkedMessageHandler,
        retry,
        () => new ParkedMessageRetryResponse(
            retry.CorrelationId, false, "Parked message handler not available")));

    Receive<ParkedMessageDiscardRequest>(discard => ForwardOrReply(
        _parkedMessageHandler,
        discard,
        () => new ParkedMessageDiscardResponse(
            discard.CorrelationId, false, "Parked message handler not available")));

    // Task 5 (#22): central→site Retry/Discard relay for parked cached operations.
    // SiteCallAuditActor relays these over the command/control channel; the
    // parked-message handler executes them against the local S&F buffer and
    // replies with a ParkedOperationActionAck that routes back to the relaying
    // SiteCallAuditActor's Ask.
    Receive<RetryParkedOperation>(retry => ForwardOrReply(
        _parkedMessageHandler,
        retry,
        () => new ParkedOperationActionAck(
            retry.CorrelationId, Applied: false, "Parked message handler not available")));

    Receive<DiscardParkedOperation>(discard => ForwardOrReply(
        _parkedMessageHandler,
        discard,
        () => new ParkedOperationActionAck(
            discard.CorrelationId, Applied: false, "Parked message handler not available")));

    // Notification Outbox: pass a buffered notification from the site
    // Store-and-Forward Engine on to the central cluster. The original Sender (the
    // S&F forwarder's Ask) is supplied as the sender of the ClusterClient.Send, so
    // the NotificationSubmitAck goes straight back to the waiting Ask, not here.
    Receive<NotificationSubmit>(submit =>
    {
        if (_centralClient is not { } central)
        {
            // No ClusterClient registered yet (e.g. central contact points not
            // configured, or registration not yet completed). A non-accepted ack
            // makes the S&F forwarder treat this as transient and retry later.
            _log.Warning(
                "Cannot forward NotificationSubmit {0} — no central ClusterClient registered",
                submit.NotificationId);
            Sender.Tell(new NotificationSubmitAck(
                submit.NotificationId, Accepted: false, Error: "Central ClusterClient not registered"));
            return;
        }

        _log.Debug("Forwarding NotificationSubmit {0} to central", submit.NotificationId);
        central.Tell(new ClusterClient.Send(centralPath, submit), Sender);
    });

    // Notification Outbox: pass a Notify.Status query on to the central cluster.
    // The original Sender (the Notify helper's Ask) is supplied as the sender of
    // the ClusterClient.Send, so the NotificationStatusResponse goes straight back
    // to the waiting Ask, not here.
    Receive<NotificationStatusQuery>(query =>
    {
        if (_centralClient is not { } central)
        {
            // No ClusterClient registered yet. Reply Found: false so Notify.Status
            // falls back to the site S&F buffer to decide Forwarding vs Unknown.
            _log.Warning(
                "Cannot forward NotificationStatusQuery {0} — no central ClusterClient registered",
                query.NotificationId);
            Sender.Tell(new NotificationStatusResponse(
                query.CorrelationId, Found: false, Status: "Unknown",
                RetryCount: 0, LastError: null, DeliveredAt: null));
            return;
        }

        _log.Debug("Forwarding NotificationStatusQuery {0} to central", query.NotificationId);
        central.Tell(new ClusterClient.Send(centralPath, query), Sender);
    });

    // Audit Log (#23): pass a batch of site-local audit events on to the central
    // cluster. The site SiteAuditTelemetryActor drains its SQLite Pending queue
    // through the ClusterClientSiteAuditClient, which Asks this actor; that Ask is
    // supplied as the sender of the ClusterClient.Send, so the
    // IngestAuditEventsReply goes straight back to it. Mirrors NotificationSubmit.
    Receive<IngestAuditEventsCommand>(ingest =>
    {
        if (_centralClient is not { } central)
        {
            // No ClusterClient registered yet (e.g. central contact points not
            // configured, or registration not yet completed). Faulting the Ask
            // makes the SiteAuditTelemetryActor drain loop treat this as transient
            // and keep the rows Pending for the next tick.
            _log.Warning(
                "Cannot forward IngestAuditEventsCommand ({0} events) — no central ClusterClient registered",
                ingest.Events.Count);
            Sender.Tell(new Status.Failure(
                new InvalidOperationException("Central ClusterClient not registered")));
            return;
        }

        _log.Debug("Forwarding IngestAuditEventsCommand ({0} events) to central", ingest.Events.Count);
        central.Tell(new ClusterClient.Send(centralPath, ingest), Sender);
    });

    // Audit Log (#23) M3: pass a batch of combined cached-call telemetry packets
    // on to the central cluster. Same forward + reply-routing pattern as
    // IngestAuditEventsCommand; central replies with an IngestCachedTelemetryReply.
    Receive<IngestCachedTelemetryCommand>(ingest =>
    {
        if (_centralClient is not { } central)
        {
            _log.Warning(
                "Cannot forward IngestCachedTelemetryCommand ({0} entries) — no central ClusterClient registered",
                ingest.Entries.Count);
            Sender.Tell(new Status.Failure(
                new InvalidOperationException("Central ClusterClient not registered")));
            return;
        }

        _log.Debug("Forwarding IngestCachedTelemetryCommand ({0} entries) to central", ingest.Entries.Count);
        central.Tell(new ClusterClient.Send(centralPath, ingest), Sender);
    });

    // Internal: heartbeat timer tick
    Receive<SendHeartbeat>(_ =>
    {
        SendHeartbeatToCentral();
    });

    // Internal: pass a health report on to central. Dropped without a reply when
    // no ClusterClient has been registered.
    Receive<SiteHealthReport>(report =>
    {
        if (_centralClient is { } central)
        {
            central.Tell(new ClusterClient.Send(centralPath, report), Self);
        }
    });
}
|
||||
|
||||
/// <inheritdoc />
protected override SupervisorStrategy SupervisorStrategy()
{
    // Every child failure is logged and answered with Resume, so the failing
    // child keeps its state. Retries are unlimited (-1) with no time window.
    Func<Exception, Directive> resumeAfterFault = fault =>
    {
        _log.Warning(fault, "Child actor of SiteCommunicationActor faulted, resuming (state preserved)");
        return Directive.Resume;
    };

    return new OneForOneStrategy(
        maxNrOfRetries: -1,
        withinTimeRange: Timeout.InfiniteTimeSpan,
        decider: Decider.From(resumeAfterFault));
}
|
||||
|
||||
/// <inheritdoc />
protected override void PreStart()
{
    _log.Info("SiteCommunicationActor started for site {0}", _siteId);

    // Start the periodic heartbeat to central: the first tick fires after one
    // second, later ticks follow the configured transport heartbeat interval.
    var firstTickDelay = TimeSpan.FromSeconds(1);
    var tickInterval = _options.TransportHeartbeatInterval;
    Timers.StartPeriodicTimer("heartbeat", new SendHeartbeat(), firstTickDelay, tickInterval);
}
|
||||
|
||||
/// <summary>
/// Stores the actor supplied in <paramref name="msg"/> as the local handler for its
/// handler type, replacing any handler registered earlier for that type.
/// </summary>
/// <param name="msg">Registration naming the handler type and the actor that serves it.</param>
private void HandleRegisterLocalHandler(RegisterLocalHandler msg)
{
    switch (msg.HandlerType)
    {
        case LocalHandlerType.EventLog:
            _eventLogHandler = msg.Handler;
            break;
        case LocalHandlerType.ParkedMessages:
            _parkedMessageHandler = msg.Handler;
            break;
        case LocalHandlerType.Integration:
            _integrationHandler = msg.Handler;
            break;
        case LocalHandlerType.Artifacts:
            _artifactHandler = msg.Handler;
            break;
        default:
            // An unrecognized value (e.g. an undefined enum cast) stores nothing,
            // so it must not be reported as a successful registration.
            _log.Warning("Ignoring registration for unknown local handler type {0}", msg.HandlerType);
            return;
    }

    _log.Info("Registered local handler for {0}", msg.HandlerType);
}
|
||||
|
||||
/// <summary>
/// Builds a <see cref="HeartbeatMessage"/> for this node and sends it to the central
/// cluster through the registered ClusterClient. Does nothing while no client is registered.
/// </summary>
private void SendHeartbeatToCentral()
{
    if (_centralClient is not { } central)
    {
        return;
    }

    var hostname = Environment.MachineName;

    // Communication-018: stamp HeartbeatMessage.IsActive with this node's true
    // active/standby role rather than hard-coding `true`. The field is part of
    // the wire contract (additive-only-evolution) so a future central health
    // dashboard can distinguish "active node down, standby up" from "site fully
    // offline" without a new message type.
    bool ResolveActiveRole()
    {
        try
        {
            return _isActiveCheck();
        }
        catch (Exception ex)
        {
            // Defensive: a cluster-state read failure must never abort the
            // heartbeat itself (heartbeats are a health signal, so their absence
            // is already meaningful). Fall back to the safest non-claiming value:
            // standby. Logged at Debug because this path normally only fires
            // during ActorSystem warm-up.
            _log.Debug(ex,
                "Active-node check threw while sending heartbeat for site {0}; reporting IsActive=false",
                _siteId);
            return false;
        }
    }

    var isActive = ResolveActiveRole();

    var heartbeat = new HeartbeatMessage(
        _siteId,
        hostname,
        IsActive: isActive,
        DateTimeOffset.UtcNow);

    central.Tell(new ClusterClient.Send("/user/central-communication", heartbeat), Self);
}
|
||||
|
||||
/// <summary>
/// Communication-018: active-node check used when the constructor is given no override.
/// Mirrors <c>ActiveNodeGate</c> in the Host (and <c>ActiveNodeHealthCheck</c>): this node
/// counts as the active member of the site cluster only when its own
/// <see cref="MemberStatus"/> is <see cref="MemberStatus.Up"/> AND it is the current
/// cluster leader. Every other state (still joining, leaving, no leader yet) reports
/// standby, which is the safe default.
/// </summary>
/// <returns><c>true</c> when this node is Up and is the cluster leader; otherwise <c>false</c>.</returns>
private bool DefaultIsActiveCheck()
{
    var cluster = Cluster.Get(Context.System);
    var selfMember = cluster.SelfMember;

    // Short-circuits: the leader is only read once this member is known to be Up.
    return selfMember.Status == MemberStatus.Up
        && cluster.State.Leader is { } leaderAddress
        && leaderAddress == selfMember.Address;
}
|
||||
|
||||
// ── Internal messages ──

/// <summary>
/// Timer tick that makes the actor send a heartbeat to central.
/// </summary>
internal record SendHeartbeat;
|
||||
}
|
||||
|
||||
/// <summary>
/// Registers the ClusterClient that the site uses to reach the central cluster.
/// </summary>
/// <param name="Client">The ClusterClient actor reference.</param>
public record RegisterCentralClient(IActorRef Client);

/// <summary>
/// Registers a local actor as the handler for one message pattern.
/// </summary>
/// <param name="HandlerType">The message pattern the handler serves.</param>
/// <param name="Handler">The local actor that receives messages of that pattern.</param>
public record RegisterLocalHandler(LocalHandlerType HandlerType, IActorRef Handler);

/// <summary>
/// Message patterns for which a local handler actor can be registered.
/// </summary>
public enum LocalHandlerType
{
    /// <summary>Handler for event log queries.</summary>
    EventLog,

    /// <summary>Handler for parked message queries and parked message/operation retry and discard requests.</summary>
    ParkedMessages,

    /// <summary>Handler for integration call requests.</summary>
    Integration,

    /// <summary>Handler for artifact deployment commands.</summary>
    Artifacts
}
|
||||
@@ -0,0 +1,108 @@
|
||||
using System.Threading.Channels;
|
||||
using Akka.Actor;
|
||||
using Akka.Event;
|
||||
using Google.Protobuf.WellKnownTypes;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Streaming;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
||||
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
using AlarmState = ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AlarmState;
|
||||
using AlarmLevel = ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AlarmLevel;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Communication.Actors;
|
||||
|
||||
/// <summary>
/// Lightweight relay actor that converts Akka domain events (AttributeValueChanged,
/// AlarmStateChanged) into protobuf <see cref="SiteStreamEvent"/> messages and writes
/// them to a System.Threading.Channels channel. The gRPC server method reads from the
/// channel's reader side.
/// </summary>
public class StreamRelayActor : ReceiveActor
{
    private readonly ILoggingAdapter _log = Context.GetLogger();
    private readonly string _correlationId;
    private readonly ChannelWriter<SiteStreamEvent> _channelWriter;

    /// <summary>
    /// Initializes a new <see cref="StreamRelayActor"/> for the given gRPC stream correlation.
    /// </summary>
    /// <param name="correlationId">Correlation id stamped on every relayed <see cref="SiteStreamEvent"/>.</param>
    /// <param name="channelWriter">Channel writer to which converted events are written.</param>
    public StreamRelayActor(string correlationId, ChannelWriter<SiteStreamEvent> channelWriter)
    {
        _correlationId = correlationId;
        _channelWriter = channelWriter;

        Receive<AttributeValueChanged>(HandleAttributeValueChanged);
        Receive<AlarmStateChanged>(HandleAlarmStateChanged);
    }

    /// <summary>Converts an attribute change into its protobuf form and queues it on the channel.</summary>
    private void HandleAttributeValueChanged(AttributeValueChanged msg)
    {
        var protoEvent = new SiteStreamEvent
        {
            CorrelationId = _correlationId,
            AttributeChanged = new AttributeValueUpdate
            {
                InstanceUniqueName = msg.InstanceUniqueName,
                AttributePath = msg.AttributePath,
                AttributeName = msg.AttributeName,
                // Protobuf string setters reject null, so guard the formatted value
                // the same way the alarm message is guarded below. Otherwise a null
                // display value would fault the actor and lose the event.
                Value = ValueFormatter.FormatDisplayValue(msg.Value) ?? string.Empty,
                Quality = MapQuality(msg.Quality),
                Timestamp = Timestamp.FromDateTimeOffset(msg.Timestamp)
            }
        };

        WriteToChannel(protoEvent);
    }

    /// <summary>Converts an alarm state change into its protobuf form and queues it on the channel.</summary>
    private void HandleAlarmStateChanged(AlarmStateChanged msg)
    {
        var protoEvent = new SiteStreamEvent
        {
            CorrelationId = _correlationId,
            AlarmChanged = new AlarmStateUpdate
            {
                InstanceUniqueName = msg.InstanceUniqueName,
                AlarmName = msg.AlarmName,
                State = MapAlarmState(msg.State),
                Priority = msg.Priority,
                Timestamp = Timestamp.FromDateTimeOffset(msg.Timestamp),
                Level = MapAlarmLevel(msg.Level),
                // Protobuf string setters reject null.
                Message = msg.Message ?? string.Empty
            }
        };

        WriteToChannel(protoEvent);
    }

    /// <summary>
    /// Attempts a non-blocking write; the event is dropped (with a warning) when the
    /// channel does not accept it.
    /// </summary>
    private void WriteToChannel(SiteStreamEvent protoEvent)
    {
        // TryWrite returns false both when the channel has no free capacity and when
        // it has already been completed, so the warning covers both cases.
        if (!_channelWriter.TryWrite(protoEvent))
        {
            _log.Warning("Channel full or closed, dropping event for correlation {0}", _correlationId);
        }
    }

    /// <summary>Maps a quality name to the protobuf enum; unrecognized names map to Unspecified.</summary>
    private static Quality MapQuality(string quality) => quality switch
    {
        "Good" => Quality.Good,
        "Uncertain" => Quality.Uncertain,
        "Bad" => Quality.Bad,
        _ => Quality.Unspecified
    };

    /// <summary>Maps the domain alarm state to the protobuf enum; unrecognized states map to Unspecified.</summary>
    private static AlarmStateEnum MapAlarmState(AlarmState state) => state switch
    {
        AlarmState.Normal => AlarmStateEnum.AlarmStateNormal,
        AlarmState.Active => AlarmStateEnum.AlarmStateActive,
        _ => AlarmStateEnum.AlarmStateUnspecified
    };

    /// <summary>Maps the domain alarm level to the protobuf enum; any other level maps to None.</summary>
    private static AlarmLevelEnum MapAlarmLevel(AlarmLevel level) => level switch
    {
        AlarmLevel.Low => AlarmLevelEnum.AlarmLevelLow,
        AlarmLevel.LowLow => AlarmLevelEnum.AlarmLevelLowLow,
        AlarmLevel.High => AlarmLevelEnum.AlarmLevelHigh,
        AlarmLevel.HighHigh => AlarmLevelEnum.AlarmLevelHighHigh,
        _ => AlarmLevelEnum.AlarmLevelNone
    };
}
|
||||
Reference in New Issue
Block a user