refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)

Solution + 23 src projects + 26 test projects renamed; folders, csproj files, and namespaces updated, and the ScadaLinkDbContext class renamed to ScadaBridgeDbContext. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
2026-05-28 09:37:45 -04:00
parent 6d87ee3c3b
commit 7b0b9c7365
1531 changed files with 11180 additions and 11054 deletions
@@ -0,0 +1,582 @@
+using System.Collections.Immutable;
+using Akka.Actor;
+using Akka.Cluster.Tools.Client;
+using Akka.Cluster.Tools.PublishSubscribe;
+using Akka.Event;
+using Microsoft.Extensions.DependencyInjection;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Communication;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Notification;
+using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+namespace ZB.MOM.WW.ScadaBridge.Communication.Actors;
+
+/// <summary>
+/// Abstraction for creating ClusterClient instances per site, enabling testability.
+/// <see cref="DefaultSiteClientFactory"/> is the implementation that creates real
+/// ClusterClient actors; <see cref="CentralCommunicationActor"/> calls the factory
+/// for each new site and again whenever a site's contact addresses change.
+/// </summary>
+public interface ISiteClientFactory
+{
+    /// <summary>Creates a ClusterClient actor for the given site with the specified contact points.</summary>
+    /// <param name="system">The actor system in which to create the client.</param>
+    /// <param name="siteId">The site identifier, used to name the actor.</param>
+    /// <param name="contacts">The set of receptionist actor paths to use as initial contacts.</param>
+    /// <returns>An actor reference for the new ClusterClient.</returns>
+    IActorRef Create(ActorSystem system, string siteId, ImmutableHashSet<ActorPath> contacts);
+}
+
+/// <summary>
+/// Production <see cref="ISiteClientFactory"/>: starts an actual ClusterClient actor
+/// for every site it is asked about.
+/// </summary>
+public class DefaultSiteClientFactory : ISiteClientFactory
+{
+    /// <inheritdoc />
+    public IActorRef Create(ActorSystem system, string siteId, ImmutableHashSet<ActorPath> contacts) =>
+        // The client is created through the actor system under the name "site-client-{siteId}".
+        system.ActorOf(
+            ClusterClient.Props(ClusterClientSettings.Create(system).WithInitialContacts(contacts)),
+            $"site-client-{siteId}");
+}
+
+/// <summary>
+/// Central-side actor that routes messages from central to site clusters via ClusterClient.
+/// Resolves site addresses from the database on a periodic refresh cycle and manages
+/// per-site ClusterClient instances.
+///
+/// WP-4: All 8 message patterns routed through this actor.
+/// WP-5: Ask timeout on connection drop (no central buffering). Debug streams killed on interruption.
+/// </summary>
+public class CentralCommunicationActor : ReceiveActor
+{
+    private readonly ILoggingAdapter _log = Context.GetLogger();
+    private readonly IServiceProvider _serviceProvider;
+    private readonly ISiteClientFactory _siteClientFactory;
+
+    /// <summary>
+    /// Per-site ClusterClient instances and their contact addresses.
+    /// Maps SiteIdentifier → (ClusterClient actor, set of contact address strings).
+    /// Refreshed periodically via RefreshSiteAddresses.
+    /// </summary>
+    private readonly Dictionary<string, (IActorRef Client, ImmutableHashSet<string> ContactAddresses)> _siteClients = new();
+
+    // Communication-016: the previous _debugSubscriptions / _inProgressDeployments
+    // dictionaries existed solely to support a documented "synchronous kill streams +
+    // mark deployments failed on site disconnect" workflow triggered by
+    // ConnectionStateChanged. No production code ever emitted that message — only
+    // the unit test did — so the workflow was dead from end to end. Disconnect
+    // detection is owned by the underlying transports: the gRPC keepalive PING
+    // signals stream interruption in ~25s (handled by DebugStreamBridgeActor's own
+    // reconnection logic), and an Ask round-trip for a deploy times out at the
+    // CommunicationService layer (caller sees failure). The tracking dicts +
+    // ConnectionStateChanged record + HandleConnectionStateChanged handler are
+    // removed; see docs/requirements/Component-Communication.md "Connection
+    // Failure Behavior" for the keepalive-based contract that survives.
+
+    private ICancelable? _refreshSchedule;
+
+    /// <summary>
+    /// Communication-019: per-actor lifecycle CTS threaded into the periodic
+    /// <see cref="LoadSiteAddressesFromDb"/> repository call so a hung MS SQL
+    /// connection is bounded by actor shutdown rather than holding piped tasks
+    /// open indefinitely. Cancelled in <see cref="PostStop"/>; never reset.
+    /// </summary>
+    private readonly CancellationTokenSource _lifecycleCts = new();
+
+    /// <summary>
+    /// Proxy <see cref="IActorRef"/> for the central NotificationOutboxActor cluster singleton.
+    /// Set via <see cref="RegisterNotificationOutbox"/> — the Host creates the singleton proxy
+    /// after this actor and registers it (mirrors how the site-side actor receives its
+    /// runtime <see cref="IActorRef"/>s). Null until registration completes; a notification
+    /// arriving before then is rejected with a non-accepted ack so the site retries.
+    /// </summary>
+    private IActorRef? _notificationOutboxProxy;
+
+    /// <summary>
+    /// Proxy <see cref="IActorRef"/> for the central AuditLogIngestActor cluster
+    /// singleton. Set via <see cref="RegisterAuditIngest"/> — the Host creates the
+    /// singleton proxy after this actor and registers it (mirrors
+    /// <see cref="_notificationOutboxProxy"/>). Null until registration completes;
+    /// an audit ingest command arriving before then is answered with an empty
+    /// reply so the site keeps its rows Pending and retries.
+    ///
+    /// Once registered, the handler Asks this proxy and pipes the reply straight
+    /// back to the caller. On an Ask timeout or a faulted reply, PipeTo forwards a
+    /// <see cref="Status.Failure"/> to the caller — the fault propagates rather
+    /// than being swallowed. This differs from the gRPC handler
+    /// (<c>SiteStreamGrpcServer</c>), which catches the exception and returns an
+    /// empty ack; here the faulted Ask is the transient signal the site relies on
+    /// (see <see cref="HandleIngestAuditEvents"/>).
+    /// </summary>
+    private IActorRef? _auditIngestProxy;
+
+    /// <summary>
+    /// Default Ask timeout for routing audit ingest commands to the
+    /// AuditLogIngestActor proxy — 30 s, matching the value of
+    /// <c>SiteStreamGrpcServer.AuditIngestAskTimeout</c> (that constant is private
+    /// to the gRPC server and not reachable here, so it is declared locally). A
+    /// generous window absorbs a slow MS SQL connection without the round-trip
+    /// surfacing as a failure on a healthy site. When the window is exceeded the
+    /// Ask faults and that fault is piped back to the caller as a
+    /// <see cref="Status.Failure"/> (see <see cref="HandleIngestAuditEvents"/>).
+    /// </summary>
+    private static readonly TimeSpan DefaultAuditIngestAskTimeout = TimeSpan.FromSeconds(30);
+
+    /// <summary>
+    /// Effective Ask timeout for audit ingest routing. Defaults to
+    /// <see cref="DefaultAuditIngestAskTimeout"/>; overridable via the constructor
+    /// so tests can exercise the timeout/fault path without waiting 30 s.
+    /// </summary>
+    private readonly TimeSpan _auditIngestAskTimeout;
+
+    /// <summary>
+    /// DistributedPubSub topic used to fan health reports out to the peer
+    /// central node so both per-node aggregators stay in sync. See
+    /// <see cref="SiteHealthReportReplica"/> for the protocol rationale.
+    /// </summary>
+    private const string HealthReportTopic = "site-health-replica";
+
+    /// <summary>
+    /// Builds the actor and registers a <c>Receive</c> handler for every message type it understands.
+    /// </summary>
+    /// <param name="serviceProvider">Provider used to resolve the health aggregator and to open scopes for the site repository.</param>
+    /// <param name="siteClientFactory">Factory that creates the per-site ClusterClient actors.</param>
+    /// <param name="auditIngestAskTimeout">
+    /// Ask timeout for audit-ingest routing. When <c>null</c>,
+    /// <see cref="DefaultAuditIngestAskTimeout"/> (30 s) applies; the override exists so
+    /// tests can reach the timeout/fault path without waiting for the default.
+    /// </param>
+    public CentralCommunicationActor(
+        IServiceProvider serviceProvider,
+        ISiteClientFactory siteClientFactory,
+        TimeSpan? auditIngestAskTimeout = null)
+    {
+        _serviceProvider = serviceProvider;
+        _siteClientFactory = siteClientFactory;
+        _auditIngestAskTimeout = auditIngestAskTimeout.GetValueOrDefault(DefaultAuditIngestAskTimeout);
+
+        // ---- Site address cache -------------------------------------------------
+        // Result of a database load, the periodic trigger that starts one, and the
+        // Status.Failure piped back when a load task faults (Communication-006).
+        Receive<SiteAddressCacheLoaded>(HandleSiteAddressCacheLoaded);
+        Receive<RefreshSiteAddresses>(_ => LoadSiteAddressesFromDb());
+        Receive<Status.Failure>(OnSiteAddressLoadFailed);
+
+        // ---- Health monitoring --------------------------------------------------
+        // Heartbeats and reports arrive from sites; replicas arrive from the peer
+        // central node and are applied locally without being re-published.
+        Receive<HeartbeatMessage>(HandleHeartbeat);
+        Receive<SiteHealthReport>(HandleSiteHealthReport);
+        Receive<SiteHealthReportReplica>(replica => ProcessLocally(replica.Report));
+        Receive<SubscribeAck>(_ =>
+        {
+            // DistributedPubSub subscribe confirmation — nothing to do.
+        });
+
+        // ---- Central → site routing ---------------------------------------------
+        Receive<SiteEnvelope>(HandleSiteEnvelope);
+
+        // ---- Notification Outbox ------------------------------------------------
+        // The Host registers the outbox singleton proxy after this actor exists;
+        // submits and status queries from sites are forwarded to that proxy so the
+        // original Sender is preserved and replies route straight back to the site.
+        Receive<RegisterNotificationOutbox>(OnRegisterNotificationOutbox);
+        Receive<NotificationSubmit>(HandleNotificationSubmit);
+        Receive<NotificationStatusQuery>(HandleNotificationStatusQuery);
+
+        // ---- Audit Log (#23) ----------------------------------------------------
+        // The Host registers the AuditLogIngestActor singleton proxy after this
+        // actor exists. Both ingest commands are Asked of that proxy and the reply
+        // is piped back to the site that sent the command.
+        Receive<RegisterAuditIngest>(OnRegisterAuditIngest);
+        Receive<IngestAuditEventsCommand>(HandleIngestAuditEvents);
+        Receive<IngestCachedTelemetryCommand>(HandleIngestCachedTelemetry);
+    }
+
+    /// <summary>
+    /// Logs a faulted site-address load at Warning so a database outage is
+    /// distinguishable from "no sites configured".
+    /// </summary>
+    /// <param name="failure">The failure piped to this actor; its cause is attached to the log entry.</param>
+    private void OnSiteAddressLoadFailed(Status.Failure failure)
+    {
+        _log.Warning(failure.Cause,
+            "Failed to load site addresses from the database; the site ClusterClient "
+            + "cache was not refreshed and may be stale or empty");
+    }
+
+    /// <summary>Stores the notification outbox proxy supplied by the Host.</summary>
+    /// <param name="registration">Message carrying the proxy reference.</param>
+    private void OnRegisterNotificationOutbox(RegisterNotificationOutbox registration)
+    {
+        _notificationOutboxProxy = registration.OutboxProxy;
+        _log.Info("Registered notification outbox proxy");
+    }
+
+    /// <summary>Stores the audit ingest proxy supplied by the Host.</summary>
+    /// <param name="registration">Message carrying the proxy reference.</param>
+    private void OnRegisterAuditIngest(RegisterAuditIngest registration)
+    {
+        _auditIngestProxy = registration.AuditIngestActor;
+        _log.Info("Registered audit ingest proxy");
+    }
+
+    /// <summary>
+    /// Routes a site-forwarded <see cref="NotificationSubmit"/> to the notification
+    /// outbox proxy, or answers with a non-accepted <see cref="NotificationSubmitAck"/>
+    /// when no proxy has been registered yet.
+    /// </summary>
+    /// <param name="msg">The notification submission received from a site.</param>
+    private void HandleNotificationSubmit(NotificationSubmit msg)
+    {
+        if (_notificationOutboxProxy is { } outbox)
+        {
+            _log.Debug("Routing NotificationSubmit {0} to the notification outbox", msg.NotificationId);
+            // Forward rather than Tell: the original Sender stays attached to the message.
+            outbox.Forward(msg);
+            return;
+        }
+
+        // Proxy not registered yet — reject so the site's Store-and-Forward
+        // forwarder treats the submission as transient and retries later.
+        _log.Warning(
+            "Cannot route NotificationSubmit {0} — notification outbox not available",
+            msg.NotificationId);
+        var rejection = new NotificationSubmitAck(
+            msg.NotificationId, Accepted: false, Error: "notification outbox not available");
+        Sender.Tell(rejection);
+    }
+
+    /// <summary>
+    /// Routes a <see cref="NotificationStatusQuery"/> to the notification outbox proxy,
+    /// or answers with a not-found <see cref="NotificationStatusResponse"/> when no
+    /// proxy has been registered yet.
+    /// </summary>
+    /// <param name="msg">The status query received from a site.</param>
+    private void HandleNotificationStatusQuery(NotificationStatusQuery msg)
+    {
+        if (_notificationOutboxProxy is { } outbox)
+        {
+            _log.Debug("Routing NotificationStatusQuery {0} to the notification outbox", msg.NotificationId);
+            // Forward rather than Tell: the original Sender stays attached to the message.
+            outbox.Forward(msg);
+            return;
+        }
+
+        // Proxy not registered yet — answer Found: false so the querying site falls
+        // back to its local Store-and-Forward buffer to resolve the status.
+        _log.Warning(
+            "Cannot route NotificationStatusQuery {0} — notification outbox not available",
+            msg.NotificationId);
+        var notFound = new NotificationStatusResponse(
+            msg.CorrelationId, Found: false, Status: "Unknown",
+            RetryCount: 0, LastError: null, DeliveredAt: null);
+        Sender.Tell(notFound);
+    }
+
+    /// <summary>
+    /// Routes a batch of site audit events to the audit ingest proxy and pipes the
+    /// proxy's <see cref="IngestAuditEventsReply"/> back to the sender. When no proxy
+    /// is registered, replies immediately with an empty reply.
+    /// </summary>
+    /// <param name="msg">The audit ingest command received from a site.</param>
+    private void HandleIngestAuditEvents(IngestAuditEventsCommand msg)
+    {
+        if (_auditIngestProxy is { } ingest)
+        {
+            _log.Debug("Routing IngestAuditEventsCommand ({0} events) to the audit ingest actor", msg.Events.Count);
+
+            // Sender must be read now, on the actor thread: Akka resets it between
+            // dispatches, so it is not safe to read once the Ask completes.
+            // If the Ask times out or faults, PipeTo hands replyTo a Status.Failure,
+            // so the fault reaches the caller instead of being swallowed. The site
+            // drain loop treats that as transient: rows stay Pending and are retried
+            // on the next tick. (The gRPC handler returns an empty ack on fault
+            // instead.)
+            var replyTo = Sender;
+            var pendingReply = ingest.Ask<IngestAuditEventsReply>(msg, _auditIngestAskTimeout);
+            pendingReply.PipeTo(replyTo);
+            return;
+        }
+
+        // Host startup race: the ingest proxy has not been registered yet. An empty
+        // reply keeps the site's rows Pending so it retries — the same behaviour as
+        // the gRPC handler's wiring-race path.
+        _log.Warning(
+            "Cannot route IngestAuditEventsCommand ({0} events) — audit ingest not available",
+            msg.Events.Count);
+        Sender.Tell(new IngestAuditEventsReply(Array.Empty<Guid>()));
+    }
+
+    /// <summary>
+    /// Routes a combined-telemetry ingest command to the audit ingest proxy and pipes
+    /// the proxy's <see cref="IngestCachedTelemetryReply"/> back to the sender. When no
+    /// proxy is registered, replies immediately with an empty reply.
+    /// </summary>
+    /// <param name="msg">The cached-telemetry ingest command received from a site.</param>
+    private void HandleIngestCachedTelemetry(IngestCachedTelemetryCommand msg)
+    {
+        if (_auditIngestProxy is { } ingest)
+        {
+            _log.Debug("Routing IngestCachedTelemetryCommand ({0} entries) to the audit ingest actor", msg.Entries.Count);
+
+            // Read Sender on the actor thread, before the asynchronous Ask completes.
+            var replyTo = Sender;
+            var pendingReply = ingest.Ask<IngestCachedTelemetryReply>(msg, _auditIngestAskTimeout);
+            pendingReply.PipeTo(replyTo);
+            return;
+        }
+
+        // No ingest proxy registered yet: answer with an empty reply.
+        _log.Warning(
+            "Cannot route IngestCachedTelemetryCommand ({0} entries) — audit ingest not available",
+            msg.Entries.Count);
+        Sender.Tell(new IngestCachedTelemetryReply(Array.Empty<Guid>()));
+    }
+
+    /// <summary>
+    /// Records a site heartbeat on the health aggregator. The heartbeat is dropped
+    /// without logging when no aggregator is registered with the service provider.
+    /// </summary>
+    /// <param name="heartbeat">Heartbeat carrying the site id and its timestamp.</param>
+    private void HandleHeartbeat(HeartbeatMessage heartbeat)
+    {
+        if (_serviceProvider.GetService<ICentralHealthAggregator>() is { } aggregator)
+        {
+            aggregator.MarkHeartbeat(heartbeat.SiteId, heartbeat.Timestamp);
+        }
+    }
+
+    /// <summary>
+    /// Handles a report delivered directly from a site (via ClusterClient):
+    /// process locally, then fan out to the peer central node so its
+    /// aggregator stays in sync.
+    /// </summary>
+    /// <param name="report">The health report received from the site.</param>
+    private void HandleSiteHealthReport(SiteHealthReport report)
+    {
+        ProcessLocally(report);
+
+        try
+        {
+            DistributedPubSub.Get(Context.System).Mediator.Tell(
+                new Publish(HealthReportTopic, new SiteHealthReportReplica(report)));
+        }
+        catch (Exception ex)
+        {
+            // Non-clustered hosts (TestKit) have no DistributedPubSub extension, so
+            // replication stays best-effort. Log at Debug — the same level PreStart
+            // uses for this condition — instead of swallowing the exception silently,
+            // so an unexpected publish failure is still diagnosable.
+            _log.Debug(
+                "DistributedPubSub not available — health report from site {0} not replicated to peer: {1}",
+                report.SiteId, ex.Message);
+        }
+    }
+
+    /// <summary>
+    /// Hands a report to the local health aggregator and does not publish it to the
+    /// peer node. Called for reports that arrive from a site as well as for replicas
+    /// received from the peer; the aggregator is idempotent via sequence-number
+    /// comparison, so duplicate delivery is tolerated.
+    /// </summary>
+    /// <param name="report">The health report to apply.</param>
+    private void ProcessLocally(SiteHealthReport report)
+    {
+        var aggregator = _serviceProvider.GetService<ICentralHealthAggregator>();
+        if (aggregator is null)
+        {
+            _log.Warning("ICentralHealthAggregator not available, dropping health report from site {0}", report.SiteId);
+            return;
+        }
+
+        aggregator.ProcessReport(report);
+    }
+
+    // Communication-016: HandleConnectionStateChanged removed — no production
+    // caller emitted ConnectionStateChanged, so the workflow ran only in tests.
+    // Disconnect detection is owned by the transport layers (gRPC keepalive +
+    // ClusterClient/Ask timeout).
+
+    /// <summary>
+    /// Sends the enveloped message to the target site through that site's
+    /// ClusterClient. When no client exists for the site, the message is dropped with
+    /// a warning and no reply is sent.
+    /// </summary>
+    /// <param name="envelope">Envelope naming the target site and carrying the payload.</param>
+    private void HandleSiteEnvelope(SiteEnvelope envelope)
+    {
+        if (_siteClients.TryGetValue(envelope.SiteId, out var entry))
+        {
+            // Pass the current Sender along so an Ask response is routed back to the caller.
+            var send = new ClusterClient.Send("/user/site-communication", envelope.Message);
+            entry.Client.Tell(send, Sender);
+            return;
+        }
+
+        // WP-5: no central buffering. Nothing is sent back, so the caller's Ask
+        // times out on its own side.
+        _log.Warning("No ClusterClient for site {0}, cannot route message {1}",
+            envelope.SiteId, envelope.Message.GetType().Name);
+    }
+
+    /// <summary>
+    /// Loads every site's contact addresses from the database on a background task
+    /// and pipes the result to this actor as a <see cref="SiteAddressCacheLoaded"/>.
+    /// A faulted or cancelled load is piped as a <see cref="Status.Failure"/>.
+    /// </summary>
+    private void LoadSiteAddressesFromDb()
+    {
+        // Self must be read on the actor thread; the task below runs elsewhere.
+        var self = Self;
+        // Communication-019: pass the actor's lifecycle CT into the repository
+        // call so a hung database query is cancelled when the actor stops
+        // rather than leaving the piped task to accumulate. Captured locally
+        // because the lifecycle CTS may have been disposed by PostStop on a
+        // racing late tick; treat that as "actor gone, give up".
+        CancellationToken ct;
+        try
+        {
+            ct = _lifecycleCts.Token;
+        }
+        catch (ObjectDisposedException)
+        {
+            return;
+        }
+
+        Task.Run(async () =>
+        {
+            using var scope = _serviceProvider.CreateScope();
+            var repo = scope.ServiceProvider.GetRequiredService<ISiteRepository>();
+            var sites = await repo.GetAllSitesAsync(ct).ConfigureAwait(false);
+
+            // Communication-020: the message record exposes read-only types
+            // (IReadOnlyDictionary / IReadOnlyList), so each bucket is wrapped with
+            // AsReadOnly() before it goes into the payload and nothing mutable is
+            // reachable through the message.
+            var contacts = new Dictionary<string, IReadOnlyList<string>>();
+            foreach (var site in sites)
+            {
+                var addrs = new List<string>(capacity: 2);
+                AddContactAddress(addrs, site.NodeAAddress);
+                AddContactAddress(addrs, site.NodeBAddress);
+
+                // Sites with no usable address are left out of the payload entirely.
+                if (addrs.Count > 0)
+                {
+                    contacts[site.SiteIdentifier] = addrs.AsReadOnly();
+                }
+            }
+
+            return new SiteAddressCacheLoaded(contacts);
+        }).PipeTo(self);
+    }
+
+    /// <summary>
+    /// Appends <paramref name="rawAddress"/> to <paramref name="target"/> unless it is
+    /// null or whitespace, first cutting off a legacy "/user/..." actor-path suffix.
+    /// </summary>
+    /// <param name="target">List receiving the normalised address.</param>
+    /// <param name="rawAddress">Node address as stored on the site row; may be null or blank.</param>
+    private static void AddContactAddress(List<string> target, string? rawAddress)
+    {
+        if (string.IsNullOrWhiteSpace(rawAddress))
+        {
+            return;
+        }
+
+        // Ordinal search: this is a protocol string, not linguistic text, so the
+        // match must not depend on the current culture. "> 0" (not ">= 0") keeps an
+        // address that begins with "/user/" untouched, as before.
+        var suffixStart = rawAddress.IndexOf("/user/", StringComparison.Ordinal);
+        target.Add(suffixStart > 0 ? rawAddress[..suffixStart] : rawAddress);
+    }
+
+    /// <summary>
+    /// Reconciles the per-site ClusterClient cache with a freshly loaded set of site
+    /// contacts: stops clients for sites that disappeared, leaves unchanged sites
+    /// alone, and replaces the client of any site whose contact addresses changed.
+    /// A site whose addresses cannot be parsed, or whose client cannot be created,
+    /// is logged and skipped without affecting the other sites.
+    /// </summary>
+    /// <param name="msg">The contact addresses loaded from the database, keyed by site id.</param>
+    private void HandleSiteAddressCacheLoaded(SiteAddressCacheLoaded msg)
+    {
+        // Stop ClusterClients for removed sites. The id list is materialised first
+        // because the dictionary cannot be modified while it is being enumerated.
+        var removedSiteIds = _siteClients.Keys
+            .Where(id => !msg.SiteContacts.ContainsKey(id))
+            .ToList();
+        foreach (var removed in removedSiteIds)
+        {
+            _log.Info("Stopping ClusterClient for removed site {0}", removed);
+            if (_siteClients.Remove(removed, out var stale))
+            {
+                Context.Stop(stale.Client);
+            }
+        }
+
+        // Add or update
+        foreach (var (siteId, addresses) in msg.SiteContacts)
+        {
+            // Communication-009: parse all addresses up front inside a try/catch so a
+            // single malformed site row cannot abort the whole refresh loop and leave
+            // the cache half-updated. A bad site is logged and skipped; others proceed.
+            ImmutableHashSet<ActorPath> contactPaths;
+            try
+            {
+                contactPaths = addresses
+                    .Select(a => ActorPath.Parse($"{a}/system/receptionist"))
+                    .ToImmutableHashSet();
+            }
+            catch (Exception ex)
+            {
+                _log.Warning(ex,
+                    "Malformed contact address for site {0}; skipping this site in the refresh "
+                    + "(other sites are unaffected)", siteId);
+                continue;
+            }
+
+            var contactStrings = addresses.ToImmutableHashSet();
+
+            // One lookup serves both the "unchanged" and the "changed" decision.
+            var hasExisting = _siteClients.TryGetValue(siteId, out var existing);
+            if (hasExisting && existing.ContactAddresses.SetEquals(contactStrings))
+            {
+                continue;
+            }
+
+            if (hasExisting)
+            {
+                _log.Info("Updating ClusterClient for site {0} (addresses changed)", siteId);
+                Context.Stop(existing.Client);
+                // Drop the entry immediately: if the re-create below fails, the cache
+                // must not keep pointing at a client that has been stopped.
+                _siteClients.Remove(siteId);
+            }
+
+            IActorRef client;
+            try
+            {
+                client = _siteClientFactory.Create(Context.System, siteId, contactPaths);
+            }
+            catch (Exception ex)
+            {
+                // Stopping an actor is asynchronous, so a replacement created under the
+                // same name right after Context.Stop can be rejected while the old one
+                // is still terminating. Letting that escape would abort the refresh for
+                // the remaining sites; skip this site and let the next refresh retry.
+                _log.Warning(ex,
+                    "Failed to create ClusterClient for site {0}; it will be retried on the next refresh",
+                    siteId);
+                continue;
+            }
+
+            _siteClients[siteId] = (client, contactStrings);
+            _log.Info("Created ClusterClient for site {0} with {1} contact(s)", siteId, addresses.Count);
+        }
+
+        _log.Info("Site ClusterClient cache refreshed with {0} site(s)", _siteClients.Count);
+    }
+
+    // Communication-016: TrackMessageForCleanup removed — the dicts it fed
+    // existed solely to support the dead ConnectionStateChanged workflow.
+
+    /// <summary>
+    /// Supervises children one-for-one with no retry limit and no time window:
+    /// every child failure is logged at Warning and the child is resumed.
+    /// </summary>
+    /// <returns>The strategy applied to this actor's children.</returns>
+    // NOTE(review): DefaultSiteClientFactory creates clients with system.ActorOf rather
+    // than Context.ActorOf — confirm which actors this strategy actually supervises.
+    protected override SupervisorStrategy SupervisorStrategy()
+    {
+        var resumeAlways = Decider.From(cause =>
+        {
+            _log.Warning(cause, "Child actor of CentralCommunicationActor faulted, resuming (state preserved)");
+            return Directive.Resume;
+        });
+
+        return new OneForOneStrategy(
+            maxNrOfRetries: -1,
+            withinTimeRange: Timeout.InfiniteTimeSpan,
+            decider: resumeAlways);
+    }
+
+    /// <summary>
+    /// Subscribes to the peer health-replication topic (best effort) and starts the
+    /// repeating site-address refresh: first tick immediately, then every 60 seconds.
+    /// </summary>
+    protected override void PreStart()
+    {
+        _log.Info("CentralCommunicationActor started");
+
+        SubscribeToPeerHealthReports();
+
+        // Periodic reload of site addresses from the database.
+        var firstTick = TimeSpan.Zero;
+        var period = TimeSpan.FromSeconds(60);
+        _refreshSchedule = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
+            firstTick,
+            period,
+            Self,
+            new RefreshSiteAddresses(),
+            ActorRefs.NoSender);
+    }
+
+    /// <summary>
+    /// Subscribes this actor to the health-report replication topic so that reports
+    /// delivered to the other central node also reach the local aggregator
+    /// (ClusterClient load-balances reports across nodes). A failure to subscribe —
+    /// for example in a non-clustered host such as TestKit — is logged at Debug and
+    /// otherwise ignored.
+    /// </summary>
+    private void SubscribeToPeerHealthReports()
+    {
+        try
+        {
+            var mediator = DistributedPubSub.Get(Context.System).Mediator;
+            mediator.Tell(new Subscribe(HealthReportTopic, Self));
+        }
+        catch (Exception ex)
+        {
+            _log.Debug("DistributedPubSub not available — peer health replication disabled: {0}", ex.Message);
+        }
+    }
+
+    /// <summary>
+    /// Cancels the periodic refresh schedule, then cancels and disposes the lifecycle
+    /// cancellation source so an in-flight site-address load is not left running.
+    /// </summary>
+    protected override void PostStop()
+    {
+        _log.Info("CentralCommunicationActor stopped");
+        _refreshSchedule?.Cancel();
+
+        CancelLifecycle();
+        _lifecycleCts.Dispose();
+    }
+
+    /// <summary>
+    /// Communication-019: signals the lifecycle token so a hung MS SQL query started
+    /// by LoadSiteAddressesFromDb does not outlive the actor. A source that has
+    /// already been disposed (double stop) is tolerated.
+    /// </summary>
+    private void CancelLifecycle()
+    {
+        try
+        {
+            _lifecycleCts.Cancel();
+        }
+        catch (ObjectDisposedException)
+        {
+            // Already disposed by an earlier stop — benign.
+        }
+    }
+}
+
+/// <summary>
+/// Command to trigger a refresh of site addresses from the database.
+/// <see cref="CentralCommunicationActor"/> schedules this message to itself in
+/// <c>PreStart</c> (immediately, then every 60 seconds) and answers it by starting
+/// a database load. The record carries no data.
+/// </summary>
+public record RefreshSiteAddresses;
+
+/// <summary>
+/// Internal message carrying the loaded site contact data from the database.
+/// ClusterClient creation happens on the actor thread in HandleSiteAddressCacheLoaded.
+///
+/// Communication-020: the payload is exposed as <see cref="IReadOnlyDictionary{TKey,TValue}"/>
+/// of <see cref="IReadOnlyList{T}"/> so the Akka.NET "messages are immutable"
+/// convention is enforced at the type level rather than relying on producer
+/// discipline. The producer wraps the constructed buckets with
+/// <c>List&lt;T&gt;.AsReadOnly()</c> before piping to Self.
+/// </summary>
+/// <param name="SiteContacts">
+/// Contact addresses keyed by site identifier. Each value holds the site's node
+/// addresses with any legacy "/user/..." suffix already removed; the handler appends
+/// "/system/receptionist" to each one to build the ClusterClient contact paths.
+/// Sites without any non-blank address are not present.
+/// </param>
+internal record SiteAddressCacheLoaded(IReadOnlyDictionary<string, IReadOnlyList<string>> SiteContacts);
+
+/// <summary>
+/// Notification sent to debug view subscribers when the stream is terminated
+/// due to site disconnection (WP-5).
+/// </summary>
+/// <param name="SiteId">Identifier of the site whose debug stream ended.</param>
+/// <param name="CorrelationId">Correlation id of the terminated debug session — presumably the id used when the session was opened; confirm against the producer.</param>
+// NOTE(review): CentralCommunicationActor no longer sends this record (its
+// disconnect workflow was removed under Communication-016) — confirm that another
+// component still produces it before relying on the summary above.
+public record DebugStreamTerminated(string SiteId, string CorrelationId);
+
+/// <summary>
+/// Registers the central NotificationOutboxActor singleton proxy with the
+/// <see cref="CentralCommunicationActor"/> so site-forwarded <see cref="NotificationSubmit"/>
+/// and <see cref="NotificationStatusQuery"/> messages can be routed to it. Sent by the Host
+/// after the outbox singleton proxy is created.
+/// </summary>
+/// <param name="OutboxProxy">
+/// Actor reference the communication actor forwards notification messages to. Until
+/// this registration arrives, submits are answered with a non-accepted ack and
+/// status queries with a not-found response.
+/// </param>
+public record RegisterNotificationOutbox(IActorRef OutboxProxy);
+
+/// <summary>
+/// Registers the central AuditLogIngestActor singleton proxy with the
+/// <see cref="CentralCommunicationActor"/> so site-forwarded
+/// <see cref="IngestAuditEventsCommand"/> and <see cref="IngestCachedTelemetryCommand"/>
+/// messages can be routed to it. Sent by the Host after the audit-ingest
+/// singleton proxy is created. Lives here (not in Commons) because
+/// <c>ZB.MOM.WW.ScadaBridge.Commons</c> has no Akka package reference and cannot hold an
+/// <see cref="IActorRef"/> field.
+/// </summary>
+/// <param name="AuditIngestActor">
+/// Actor reference the communication actor Asks for both ingest commands. Until this
+/// registration arrives, ingest commands are answered with an empty reply.
+/// </param>
+public sealed record RegisterAuditIngest(IActorRef AuditIngestActor);
@@ -0,0 +1,291 @@
+using Akka.Actor;
+using Akka.Event;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.DebugView;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Streaming;
+using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
+
+namespace ZB.MOM.WW.ScadaBridge.Communication.Actors;
+
+/// <summary>
+/// Long-lived (one per active debug session) actor on the central side. Debug sessions
+/// are session-based and temporary — this actor holds no persisted state and does not
+/// derive from an Akka.Persistence base class; its state does not survive a restart.
+/// Sends SubscribeDebugViewRequest to the site via CentralCommunicationActor (with THIS actor
+/// as the Sender) to get the initial snapshot. After receiving the snapshot, opens a gRPC
+/// server-streaming subscription via SiteStreamGrpcClient for ongoing events.
+/// Stream events are marshalled back to the actor via Self.Tell for thread safety.
+/// </summary>
+/// <remarks>
+/// Reconnect policy: on a stream error the actor unsubscribes from the failed endpoint,
+/// flips between node A and node B, and reopens the stream — immediately for the first
+/// retry, after <see cref="ReconnectDelay"/> for later ones. Once more than
+/// <see cref="MaxRetries"/> consecutive failures occur without the stream surviving
+/// <see cref="StabilityWindow"/>, the session is terminated and the consumer is notified.
+/// </remarks>
+public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
+{
+    private readonly ILoggingAdapter _log = Context.GetLogger();
+    private readonly string _siteIdentifier;
+    private readonly string _instanceUniqueName;
+    private readonly string _correlationId;
+    private readonly IActorRef _centralCommunicationActor;
+    private readonly Action<object> _onEvent;
+    private readonly Action _onTerminated;
+    private readonly SiteStreamGrpcClientFactory _grpcFactory;
+    private readonly string _grpcNodeAAddress;
+    private readonly string _grpcNodeBAddress;
+
+    private const int MaxRetries = 3;
+    private const string ReconnectTimerKey = "grpc-reconnect";
+    private const string StabilityTimerKey = "grpc-stability";
+
+    /// <summary>
+    /// Mailbox-inactivity period after which the session is treated as orphaned and
+    /// stops itself (see the ReceiveTimeout handler in the constructor).
+    /// </summary>
+    private static readonly TimeSpan OrphanInactivityTimeout = TimeSpan.FromMinutes(5);
+
+    /// <summary>Delay between gRPC reconnection attempts.</summary>
+    internal static TimeSpan ReconnectDelay { get; set; } = TimeSpan.FromSeconds(5);
+
+    /// <summary>
+    /// How long a freshly-opened gRPC stream must stay up before its retry budget
+    /// is considered "recovered" and <see cref="_retryCount"/> is reset to 0.
+    /// Communication-008: the retry count must NOT be reset by individual events —
+    /// a stream that connects, delivers one event, then fails repeatedly would
+    /// otherwise reconnect forever and never trip <see cref="MaxRetries"/>. Resetting
+    /// only after a stable interval bounds a flapping stream.
+    /// </summary>
+    internal static TimeSpan StabilityWindow { get; set; } = TimeSpan.FromSeconds(60);
+
+    private int _retryCount;
+    private bool _useNodeA = true;
+    private bool _stopped;
+    private CancellationTokenSource? _grpcCts;
+
+    /// <summary>Timer scheduler for reconnect and stability window timers.</summary>
+    public ITimerScheduler Timers { get; set; } = null!;
+
+    /// <summary>gRPC address of the site node currently selected by <see cref="_useNodeA"/>.</summary>
+    private string CurrentEndpoint => _useNodeA ? _grpcNodeAAddress : _grpcNodeBAddress;
+
+    /// <summary>
+    /// Initializes the debug stream bridge actor and registers message handlers.
+    /// </summary>
+    /// <param name="siteIdentifier">Site identifier for targeting ClusterClient messages and logging.</param>
+    /// <param name="instanceUniqueName">Unique name of the instance whose debug stream is being bridged.</param>
+    /// <param name="correlationId">Correlation id for the debug session.</param>
+    /// <param name="centralCommunicationActor">Actor used to forward ClusterClient messages to the site.</param>
+    /// <param name="onEvent">Callback invoked on each received debug event.</param>
+    /// <param name="onTerminated">Callback invoked when the stream terminates.</param>
+    /// <param name="grpcFactory">Factory for creating gRPC streaming clients.</param>
+    /// <param name="grpcNodeAAddress">gRPC address of the site's node A.</param>
+    /// <param name="grpcNodeBAddress">gRPC address of the site's node B.</param>
+    public DebugStreamBridgeActor(
+        string siteIdentifier,
+        string instanceUniqueName,
+        string correlationId,
+        IActorRef centralCommunicationActor,
+        Action<object> onEvent,
+        Action onTerminated,
+        SiteStreamGrpcClientFactory grpcFactory,
+        string grpcNodeAAddress,
+        string grpcNodeBAddress)
+    {
+        _siteIdentifier = siteIdentifier;
+        _instanceUniqueName = instanceUniqueName;
+        _correlationId = correlationId;
+        _centralCommunicationActor = centralCommunicationActor;
+        _onEvent = onEvent;
+        _onTerminated = onTerminated;
+        _grpcFactory = grpcFactory;
+        _grpcNodeAAddress = grpcNodeAAddress;
+        _grpcNodeBAddress = grpcNodeBAddress;
+
+        // Initial snapshot response from the site (via ClusterClient)
+        Receive<DebugViewSnapshot>(snapshot =>
+        {
+            _log.Info("Received initial snapshot for {0} ({1} attrs, {2} alarms)",
+                _instanceUniqueName, snapshot.AttributeValues.Count, snapshot.AlarmStates.Count);
+            _onEvent(snapshot);
+            OpenGrpcStream();
+        });
+
+        // Domain events arriving via Self.Tell from gRPC callback.
+        // Communication-008: receiving an event must NOT reset _retryCount — a
+        // flapping stream that delivers a single event between failures would
+        // otherwise never trip MaxRetries. The retry budget is recovered only by
+        // GrpcStreamStable (a stream that has stayed up for StabilityWindow).
+        Receive<AttributeValueChanged>(changed => _onEvent(changed));
+        Receive<AlarmStateChanged>(changed => _onEvent(changed));
+
+        // Stream has been stably connected for StabilityWindow — recover the
+        // retry budget so a future transient fault gets a fresh set of retries.
+        Receive<GrpcStreamStable>(_ =>
+        {
+            if (_stopped) return;
+            _retryCount = 0;
+            _log.Debug("gRPC stream for {0} stable, retry count reset", _instanceUniqueName);
+        });
+
+        // gRPC stream error — attempt reconnection
+        Receive<GrpcStreamError>(msg =>
+        {
+            _log.Warning("gRPC stream error for {0}: {1}", _instanceUniqueName, msg.Exception.Message);
+            HandleGrpcError();
+        });
+
+        // Scheduled reconnection
+        Receive<ReconnectGrpcStream>(_ => OpenGrpcStream());
+
+        // Consumer requests stop: the consumer initiated this, so it is not called
+        // back via onTerminated; the site is told to drop the subscription.
+        Receive<StopDebugStream>(_ =>
+        {
+            _log.Info("Stopping debug stream for {0}", _instanceUniqueName);
+            Terminate(notifySite: true, notifyConsumer: false);
+        });
+
+        // Site disconnected — CentralCommunicationActor notifies us. No unsubscribe is
+        // sent because the site is the party that went away.
+        Receive<DebugStreamTerminated>(msg =>
+        {
+            if (_stopped) return; // Idempotent — gRPC error may arrive simultaneously
+            _log.Warning("Debug stream terminated for {0} (site {1} disconnected)", _instanceUniqueName, msg.SiteId);
+            Terminate(notifySite: false, notifyConsumer: true);
+        });
+
+        // Orphan safety net. Akka's receive timeout measures mailbox *inactivity*: the
+        // ReceiveTimeout message is delivered when this actor has received no message
+        // for OrphanInactivityTimeout, and the countdown restarts whenever a message
+        // arrives. It is therefore not a cap on total session lifetime — a session is
+        // treated as orphaned only after it has been completely idle for that long.
+        Context.SetReceiveTimeout(OrphanInactivityTimeout);
+        Receive<ReceiveTimeout>(_ =>
+        {
+            _log.Warning("Debug stream for {0} timed out (orphaned session), stopping", _instanceUniqueName);
+            Terminate(notifySite: true, notifyConsumer: true);
+        });
+    }
+
+    /// <inheritdoc />
+    protected override void PreStart()
+    {
+        _log.Info("Starting debug stream bridge for {0} on site {1}", _instanceUniqueName, _siteIdentifier);
+
+        // Send subscribe request via CentralCommunicationActor for the initial snapshot.
+        var request = new SubscribeDebugViewRequest(_instanceUniqueName, _correlationId);
+        var envelope = new SiteEnvelope(_siteIdentifier, request);
+        _centralCommunicationActor.Tell(envelope, Self);
+    }
+
+    /// <inheritdoc />
+    protected override void PostStop()
+    {
+        // Make sure the background subscription task is cancelled even when the actor
+        // is stopped from outside (no handler above ran).
+        CancelGrpcCts();
+        base.PostStop();
+    }
+
+    /// <summary>
+    /// Opens (or re-opens) the gRPC event stream against <see cref="CurrentEndpoint"/>,
+    /// replacing any previous subscription token and arming the stability timer.
+    /// </summary>
+    private void OpenGrpcStream()
+    {
+        if (_stopped) return;
+
+        var endpoint = CurrentEndpoint;
+        _log.Info("Opening gRPC stream for {0} to {1}", _instanceUniqueName, endpoint);
+
+        CancelGrpcCts();
+        _grpcCts = new CancellationTokenSource();
+
+        // Arm the stability timer: if the stream stays up for StabilityWindow the
+        // retry budget is recovered (Communication-008). Cancelled by HandleGrpcError.
+        Timers.StartSingleTimer(StabilityTimerKey, new GrpcStreamStable(), StabilityWindow);
+
+        var client = _grpcFactory.GetOrCreate(_siteIdentifier, endpoint);
+        // Capture Self and the token now: the lambda below runs on a thread-pool
+        // thread where the actor context is not available.
+        var self = Self;
+        var ct = _grpcCts.Token;
+
+        // Launch as background task — onEvent and onError marshal back to actor via Tell.
+        // The task is intentionally not awaited (fire-and-forget).
+        // NOTE(review): an exception thrown by SubscribeAsync itself, rather than reported
+        // through the onError callback, would go unobserved here — confirm the client
+        // always routes failures through onError.
+        _ = Task.Run(async () =>
+        {
+            await client.SubscribeAsync(
+                _correlationId,
+                _instanceUniqueName,
+                evt => self.Tell(evt),
+                ex => self.Tell(new GrpcStreamError(ex)),
+                ct);
+        }, ct);
+    }
+
+    /// <summary>
+    /// Reacts to a failed gRPC stream: consumes one retry, and either terminates the
+    /// session (budget exhausted) or fails over to the other node and schedules a reconnect.
+    /// </summary>
+    private void HandleGrpcError()
+    {
+        if (_stopped) return;
+
+        // The stream failed before reaching the stability window — its retry
+        // budget is NOT recovered (Communication-008).
+        Timers.Cancel(StabilityTimerKey);
+
+        _retryCount++;
+
+        if (_retryCount > MaxRetries)
+        {
+            _log.Error("gRPC stream for {0} exceeded max retries ({1}), terminating", _instanceUniqueName, MaxRetries);
+            Terminate(notifySite: false, notifyConsumer: true);
+            return;
+        }
+
+        // Unsubscribe the failed stream on the *previous* endpoint before reconnecting
+        // (CurrentEndpoint still points at it because the flip happens below).
+        // This cancels the local subscription CTS and -- where the channel is still
+        // alive -- propagates gRPC cancellation to the site so its SiteStreamGrpcServer
+        // stops the StreamRelayActor for this correlation ID, rather than leaving a
+        // zombie relay actor until TCP RST / keepalive eventually detects the loss.
+        UnsubscribeCurrentEndpoint();
+
+        // Flip to the other node
+        _useNodeA = !_useNodeA;
+
+        // First retry is immediate, subsequent retries use a short backoff
+        if (_retryCount == 1)
+        {
+            Self.Tell(new ReconnectGrpcStream());
+        }
+        else
+        {
+            Timers.StartSingleTimer(ReconnectTimerKey, new ReconnectGrpcStream(), ReconnectDelay);
+        }
+    }
+
+    /// <summary>
+    /// Common shutdown sequence for every termination path: release gRPC resources,
+    /// optionally tell the site to unsubscribe, mark the actor stopped, optionally
+    /// invoke the consumer's termination callback, then stop the actor.
+    /// </summary>
+    /// <param name="notifySite">When true, sends an UnsubscribeDebugViewRequest to the site.</param>
+    /// <param name="notifyConsumer">When true, invokes the onTerminated callback.</param>
+    private void Terminate(bool notifySite, bool notifyConsumer)
+    {
+        CleanupGrpc();
+
+        if (notifySite)
+        {
+            SendUnsubscribe();
+        }
+
+        _stopped = true;
+
+        if (notifyConsumer)
+        {
+            _onTerminated();
+        }
+
+        Context.Stop(Self);
+    }
+
+    /// <summary>Cancels the local subscription token and unsubscribes from the current endpoint.</summary>
+    private void CleanupGrpc()
+    {
+        CancelGrpcCts();
+        UnsubscribeCurrentEndpoint();
+    }
+
+    /// <summary>Cancels and disposes the current subscription token source, if any.</summary>
+    private void CancelGrpcCts()
+    {
+        _grpcCts?.Cancel();
+        _grpcCts?.Dispose();
+        _grpcCts = null;
+    }
+
+    /// <summary>Asks the gRPC client for <see cref="CurrentEndpoint"/> to drop this session's subscription.</summary>
+    private void UnsubscribeCurrentEndpoint()
+    {
+        var client = _grpcFactory.GetOrCreate(_siteIdentifier, CurrentEndpoint);
+        client.Unsubscribe(_correlationId);
+    }
+
+    /// <summary>Sends an UnsubscribeDebugViewRequest to the site via CentralCommunicationActor.</summary>
+    private void SendUnsubscribe()
+    {
+        var request = new UnsubscribeDebugViewRequest(_instanceUniqueName, _correlationId);
+        var envelope = new SiteEnvelope(_siteIdentifier, request);
+        _centralCommunicationActor.Tell(envelope, Self);
+    }
+}
+
+/// <summary>
+/// Message sent to a DebugStreamBridgeActor to stop the debug stream session.
+/// On receipt the actor releases its gRPC subscription, sends an
+/// UnsubscribeDebugViewRequest to the site, and stops itself; the termination
+/// callback is not invoked on this path.
+/// </summary>
+public record StopDebugStream;
+
+/// <summary>
+/// Internal message indicating a gRPC stream error occurred.
+/// Posted to the DebugStreamBridgeActor by the gRPC subscription's error callback so
+/// the failure is handled on the actor thread.
+/// </summary>
+/// <param name="Exception">The exception reported by the stream; its message is logged by the actor.</param>
+internal record GrpcStreamError(Exception Exception);
+
+/// <summary>
+/// Internal message to trigger gRPC stream reconnection.
+/// The DebugStreamBridgeActor sends it to itself immediately for the first retry and
+/// via a single-shot timer (<see cref="DebugStreamBridgeActor.ReconnectDelay"/>) for later ones.
+/// </summary>
+internal record ReconnectGrpcStream;
+
+/// <summary>
+/// Internal message indicating the current gRPC stream has been connected long
+/// enough (<see cref="DebugStreamBridgeActor.StabilityWindow"/>) to be considered
+/// stable, so the reconnect retry budget can be recovered.
+/// Scheduled as a single-shot timer each time a stream is opened and cancelled when
+/// the stream reports an error first.
+/// </summary>
+internal record GrpcStreamStable;
@@ -0,0 +1,457 @@
+using Akka.Actor;
+using Akka.Cluster;
+using Akka.Cluster.Tools.Client;
+using Akka.Event;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Artifacts;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.DebugView;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.InboundApi;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Lifecycle;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Notification;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.RemoteQuery;
+
+namespace ZB.MOM.WW.ScadaBridge.Communication.Actors;
+
+/// <summary>
+/// Site-side actor that receives messages from central via ClusterClient and routes
+/// them to the appropriate local actors. Also sends heartbeats and health reports
+/// to central via the registered ClusterClient.
+///
+/// WP-4: Routes all 8 message patterns to local handlers.
+/// </summary>
+public class SiteCommunicationActor : ReceiveActor, IWithTimers
+{
+    /// <summary>Actor path on the central cluster that receives everything this actor sends upstream.</summary>
+    private const string CentralCommunicationPath = "/user/central-communication";
+
+    /// <summary>Key of the periodic heartbeat timer.</summary>
+    private const string HeartbeatTimerKey = "heartbeat";
+
+    /// <summary>Delay before the first heartbeat tick after the actor starts.</summary>
+    private static readonly TimeSpan HeartbeatInitialDelay = TimeSpan.FromSeconds(1);
+
+    private readonly ILoggingAdapter _log = Context.GetLogger();
+    private readonly string _siteId;
+    private readonly CommunicationOptions _options;
+
+    /// <summary>
+    /// Communication-018: predicate that returns <c>true</c> when this node is
+    /// the active member of the local site cluster (used to stamp
+    /// <see cref="HeartbeatMessage.IsActive"/>). Production builds default to
+    /// the Akka <see cref="Cluster"/> leader check; tests inject a stub so they
+    /// do not need a real cluster.
+    /// </summary>
+    private readonly Func<bool> _isActiveCheck;
+
+    /// <summary>
+    /// Reference to the local Deployment Manager singleton proxy.
+    /// </summary>
+    private readonly IActorRef _deploymentManagerProxy;
+
+    /// <summary>
+    /// ClusterClient reference for sending messages to the central cluster.
+    /// Set via RegisterCentralClient message.
+    /// </summary>
+    private IActorRef? _centralClient;
+
+    /// <summary>
+    /// Local actor references for routing specific message patterns.
+    /// Populated via registration messages.
+    /// </summary>
+    private IActorRef? _eventLogHandler;
+    private IActorRef? _parkedMessageHandler;
+    private IActorRef? _integrationHandler;
+    private IActorRef? _artifactHandler;
+
+    /// <summary>Akka timer scheduler injected by the framework via <see cref="IWithTimers"/>.</summary>
+    public ITimerScheduler Timers { get; set; } = null!;
+
+    /// <summary>Initializes the actor, wires all message pattern handlers, and schedules the periodic heartbeat.</summary>
+    /// <param name="siteId">The site identifier included in outbound messages.</param>
+    /// <param name="options">Communication options including heartbeat interval and transport settings.</param>
+    /// <param name="deploymentManagerProxy">Local reference to the Deployment Manager singleton proxy.</param>
+    /// <param name="isActiveCheck">
+    /// Communication-018: optional override returning <c>true</c> when this node
+    /// is the active member of the site cluster. <c>null</c> uses the real
+    /// Akka <see cref="Cluster"/> leader check (the default for production
+    /// wiring); tests pass a stub so they do not need to load Akka.Cluster
+    /// into the <c>TestKit</c> ActorSystem.
+    /// </param>
+    public SiteCommunicationActor(
+        string siteId,
+        CommunicationOptions options,
+        IActorRef deploymentManagerProxy,
+        Func<bool>? isActiveCheck = null)
+    {
+        _siteId = siteId;
+        _options = options;
+        _deploymentManagerProxy = deploymentManagerProxy;
+        _isActiveCheck = isActiveCheck ?? DefaultIsActiveCheck;
+
+        // Registration
+        Receive<RegisterCentralClient>(msg =>
+        {
+            _centralClient = msg.Client;
+            _log.Info("Registered central ClusterClient");
+        });
+        Receive<RegisterLocalHandler>(HandleRegisterLocalHandler);
+
+        // Pattern 1: Instance Deployment — forward to Deployment Manager
+        Receive<DeployInstanceCommand>(msg =>
+        {
+            _log.Debug("Routing DeployInstanceCommand for {0} to DeploymentManager", msg.InstanceUniqueName);
+            _deploymentManagerProxy.Forward(msg);
+        });
+
+        // Pattern 2: Lifecycle — forward to Deployment Manager
+        Receive<DisableInstanceCommand>(msg => _deploymentManagerProxy.Forward(msg));
+        Receive<EnableInstanceCommand>(msg => _deploymentManagerProxy.Forward(msg));
+        Receive<DeleteInstanceCommand>(msg => _deploymentManagerProxy.Forward(msg));
+
+        // DeploymentManager-006: query-the-site-before-redeploy — forward to
+        // the Deployment Manager, which owns the deployed-config store and
+        // answers with the instance's currently-applied deployment identity.
+        Receive<DeploymentStateQueryRequest>(msg => _deploymentManagerProxy.Forward(msg));
+
+        // Pattern 3: Artifact Deployment — forward to artifact handler if registered
+        Receive<DeployArtifactsCommand>(msg => ForwardOrReply(_artifactHandler, msg, () =>
+        {
+            _log.Warning("No artifact handler registered, replying with failure");
+            return new ArtifactDeploymentResponse(
+                msg.DeploymentId, _siteId, false, "Artifact handler not available", DateTimeOffset.UtcNow);
+        }));
+
+        // Pattern 4: Integration Routing — forward to integration handler
+        Receive<IntegrationCallRequest>(msg => ForwardOrReply(_integrationHandler, msg, () =>
+            new IntegrationCallResponse(
+                msg.CorrelationId, _siteId, false, null, "Integration handler not available", DateTimeOffset.UtcNow)));
+
+        // Pattern 5: Debug View — forward to Deployment Manager (which routes to Instance Actor)
+        Receive<SubscribeDebugViewRequest>(msg => _deploymentManagerProxy.Forward(msg));
+        Receive<UnsubscribeDebugViewRequest>(msg => _deploymentManagerProxy.Forward(msg));
+
+        // Pattern 6a: Debug Snapshot (one-shot) — forward to Deployment Manager
+        Receive<DebugSnapshotRequest>(msg => _deploymentManagerProxy.Forward(msg));
+
+        // Inbound API Route.To() — forward to Deployment Manager for instance routing
+        Receive<RouteToCallRequest>(msg => _deploymentManagerProxy.Forward(msg));
+        Receive<RouteToGetAttributesRequest>(msg => _deploymentManagerProxy.Forward(msg));
+        Receive<RouteToSetAttributesRequest>(msg => _deploymentManagerProxy.Forward(msg));
+
+        // Pattern 7: Remote Queries
+        Receive<EventLogQueryRequest>(msg => ForwardOrReply(_eventLogHandler, msg, () =>
+            new EventLogQueryResponse(
+                msg.CorrelationId, _siteId, [], null, false, false,
+                "Event log handler not available", DateTimeOffset.UtcNow)));
+
+        Receive<ParkedMessageQueryRequest>(msg => ForwardOrReply(_parkedMessageHandler, msg, () =>
+            new ParkedMessageQueryResponse(
+                msg.CorrelationId, _siteId, [], 0, msg.PageNumber, msg.PageSize, false,
+                "Parked message handler not available", DateTimeOffset.UtcNow)));
+
+        Receive<ParkedMessageRetryRequest>(msg => ForwardOrReply(_parkedMessageHandler, msg, () =>
+            new ParkedMessageRetryResponse(
+                msg.CorrelationId, false, "Parked message handler not available")));
+
+        Receive<ParkedMessageDiscardRequest>(msg => ForwardOrReply(_parkedMessageHandler, msg, () =>
+            new ParkedMessageDiscardResponse(
+                msg.CorrelationId, false, "Parked message handler not available")));
+
+        // Task 5 (#22): central→site Retry/Discard relay for parked cached
+        // operations. SiteCallAuditActor relays these over the command/control
+        // channel; the parked-message handler executes them against the local
+        // S&F buffer and replies a ParkedOperationActionAck that routes back to
+        // the relaying SiteCallAuditActor's Ask.
+        Receive<RetryParkedOperation>(msg => ForwardOrReply(_parkedMessageHandler, msg, () =>
+            new ParkedOperationActionAck(
+                msg.CorrelationId, Applied: false, "Parked message handler not available")));
+
+        Receive<DiscardParkedOperation>(msg => ForwardOrReply(_parkedMessageHandler, msg, () =>
+            new ParkedOperationActionAck(
+                msg.CorrelationId, Applied: false, "Parked message handler not available")));
+
+        // Notification Outbox: forward a buffered notification submitted by the site
+        // Store-and-Forward Engine to the central cluster. The original Sender (the
+        // S&F forwarder's Ask) is forwarded as the ClusterClient.Send sender so the
+        // NotificationSubmitAck routes straight back to the waiting Ask, not here.
+        Receive<NotificationSubmit>(msg =>
+        {
+            if (_centralClient == null)
+            {
+                // No ClusterClient registered yet (e.g. central contact points not
+                // configured, or registration not yet completed). A non-accepted ack
+                // makes the S&F forwarder treat this as transient and retry later.
+                _log.Warning(
+                    "Cannot forward NotificationSubmit {0} — no central ClusterClient registered",
+                    msg.NotificationId);
+                Sender.Tell(new NotificationSubmitAck(
+                    msg.NotificationId, Accepted: false, Error: "Central ClusterClient not registered"));
+                return;
+            }
+
+            _log.Debug("Forwarding NotificationSubmit {0} to central", msg.NotificationId);
+            SendToCentral(_centralClient, msg, Sender);
+        });
+
+        // Notification Outbox: forward a Notify.Status query to the central cluster.
+        // The original Sender (the Notify helper's Ask) is forwarded as the
+        // ClusterClient.Send sender so the NotificationStatusResponse routes straight
+        // back to the waiting Ask, not here.
+        Receive<NotificationStatusQuery>(msg =>
+        {
+            if (_centralClient == null)
+            {
+                // No ClusterClient registered yet. Reply Found: false so Notify.Status
+                // falls back to the site S&F buffer to decide Forwarding vs Unknown.
+                _log.Warning(
+                    "Cannot forward NotificationStatusQuery {0} — no central ClusterClient registered",
+                    msg.NotificationId);
+                Sender.Tell(new NotificationStatusResponse(
+                    msg.CorrelationId, Found: false, Status: "Unknown",
+                    RetryCount: 0, LastError: null, DeliveredAt: null));
+                return;
+            }
+
+            _log.Debug("Forwarding NotificationStatusQuery {0} to central", msg.NotificationId);
+            SendToCentral(_centralClient, msg, Sender);
+        });
+
+        // Audit Log (#23): forward a batch of site-local audit events to the
+        // central cluster. The site SiteAuditTelemetryActor drains its SQLite
+        // Pending queue through the ClusterClientSiteAuditClient, which Asks
+        // this actor; the original Sender (that Ask) is passed as the
+        // ClusterClient.Send sender so the IngestAuditEventsReply routes
+        // straight back to the waiting Ask, not here. Mirrors NotificationSubmit.
+        Receive<IngestAuditEventsCommand>(msg =>
+        {
+            if (_centralClient == null)
+            {
+                // No ClusterClient registered yet (e.g. central contact points
+                // not configured, or registration not yet completed). Faulting
+                // the Ask makes the SiteAuditTelemetryActor drain loop treat
+                // this as transient and keep the rows Pending for the next tick.
+                _log.Warning(
+                    "Cannot forward IngestAuditEventsCommand ({0} events) — no central ClusterClient registered",
+                    msg.Events.Count);
+                Sender.Tell(new Status.Failure(
+                    new InvalidOperationException("Central ClusterClient not registered")));
+                return;
+            }
+
+            _log.Debug("Forwarding IngestAuditEventsCommand ({0} events) to central", msg.Events.Count);
+            SendToCentral(_centralClient, msg, Sender);
+        });
+
+        // Audit Log (#23) M3: forward a batch of combined cached-call telemetry
+        // packets to the central cluster. Same forward + reply-routing pattern
+        // as IngestAuditEventsCommand; central replies with an
+        // IngestCachedTelemetryReply.
+        Receive<IngestCachedTelemetryCommand>(msg =>
+        {
+            if (_centralClient == null)
+            {
+                _log.Warning(
+                    "Cannot forward IngestCachedTelemetryCommand ({0} entries) — no central ClusterClient registered",
+                    msg.Entries.Count);
+                Sender.Tell(new Status.Failure(
+                    new InvalidOperationException("Central ClusterClient not registered")));
+                return;
+            }
+
+            _log.Debug("Forwarding IngestCachedTelemetryCommand ({0} entries) to central", msg.Entries.Count);
+            SendToCentral(_centralClient, msg, Sender);
+        });
+
+        // Internal: send heartbeat tick
+        Receive<SendHeartbeat>(_ => SendHeartbeatToCentral());
+
+        // Internal: forward health report to central. Reports arriving before a
+        // ClusterClient is registered are dropped without a reply.
+        Receive<SiteHealthReport>(msg =>
+        {
+            if (_centralClient != null)
+            {
+                SendToCentral(_centralClient, msg, Self);
+            }
+        });
+
+    }
+
+    /// <inheritdoc />
+    protected override SupervisorStrategy SupervisorStrategy()
+    {
+        return new OneForOneStrategy(
+            maxNrOfRetries: -1,
+            withinTimeRange: Timeout.InfiniteTimeSpan,
+            decider: Decider.From(ex =>
+            {
+                _log.Warning(ex, "Child actor of SiteCommunicationActor faulted, resuming (state preserved)");
+                return Directive.Resume;
+            }));
+    }
+
+    /// <inheritdoc />
+    protected override void PreStart()
+    {
+        _log.Info("SiteCommunicationActor started for site {0}", _siteId);
+
+        // Schedule periodic heartbeat to central
+        Timers.StartPeriodicTimer(
+            HeartbeatTimerKey,
+            new SendHeartbeat(),
+            HeartbeatInitialDelay,
+            _options.TransportHeartbeatInterval);
+    }
+
+    /// <summary>
+    /// Forwards <paramref name="message"/> (keeping the original sender) to
+    /// <paramref name="handler"/> when one is registered; otherwise replies to the
+    /// sender with the message built by <paramref name="buildUnavailableReply"/>.
+    /// </summary>
+    /// <param name="handler">The registered local handler, or <c>null</c> when none is registered.</param>
+    /// <param name="message">The inbound message to route.</param>
+    /// <param name="buildUnavailableReply">
+    /// Factory for the "handler not available" reply; invoked only when no handler is
+    /// registered, so timestamps and log side effects happen only on that path.
+    /// </param>
+    private void ForwardOrReply(IActorRef? handler, object message, Func<object> buildUnavailableReply)
+    {
+        if (handler != null)
+        {
+            handler.Forward(message);
+        }
+        else
+        {
+            Sender.Tell(buildUnavailableReply());
+        }
+    }
+
+    /// <summary>
+    /// Wraps <paramref name="message"/> in a <see cref="ClusterClient.Send"/> addressed to
+    /// <see cref="CentralCommunicationPath"/> and hands it to the ClusterClient.
+    /// </summary>
+    /// <param name="centralClient">The registered ClusterClient actor.</param>
+    /// <param name="message">The payload to deliver to central.</param>
+    /// <param name="replyTo">Actor presented as the sender, i.e. where central's reply is routed.</param>
+    private static void SendToCentral(IActorRef centralClient, object message, IActorRef replyTo)
+    {
+        centralClient.Tell(new ClusterClient.Send(CentralCommunicationPath, message), replyTo);
+    }
+
+    /// <summary>
+    /// Stores the handler reference for the given pattern. An unrecognised handler type
+    /// is logged as a warning and ignored rather than being reported as registered.
+    /// </summary>
+    private void HandleRegisterLocalHandler(RegisterLocalHandler msg)
+    {
+        switch (msg.HandlerType)
+        {
+            case LocalHandlerType.EventLog:
+                _eventLogHandler = msg.Handler;
+                break;
+            case LocalHandlerType.ParkedMessages:
+                _parkedMessageHandler = msg.Handler;
+                break;
+            case LocalHandlerType.Integration:
+                _integrationHandler = msg.Handler;
+                break;
+            case LocalHandlerType.Artifacts:
+                _artifactHandler = msg.Handler;
+                break;
+            default:
+                // Nothing was stored, so do not claim the handler was registered.
+                _log.Warning("Ignoring registration for unknown local handler type {0}", msg.HandlerType);
+                return;
+        }
+
+        _log.Info("Registered local handler for {0}", msg.HandlerType);
+    }
+
+    /// <summary>
+    /// Sends one heartbeat to central; a no-op until a ClusterClient has been registered.
+    /// </summary>
+    private void SendHeartbeatToCentral()
+    {
+        if (_centralClient == null)
+            return;
+
+        var hostname = Environment.MachineName;
+
+        // Communication-018: stamp HeartbeatMessage.IsActive with this node's
+        // true active/standby role rather than hard-coding `true`. The field is
+        // part of the wire contract (additive-only-evolution) so a future
+        // central health dashboard can distinguish "active node down, standby
+        // up" from "site fully offline" without a new message type.
+        bool isActive;
+        try
+        {
+            isActive = _isActiveCheck();
+        }
+        catch (Exception ex)
+        {
+            // Defensive: never let a cluster-state read failure abort the
+            // heartbeat itself (heartbeats are health signal — their absence is
+            // already meaningful). Fall back to the safest non-claiming value:
+            // standby. Logged at Debug because this path normally only fires
+            // during ActorSystem warm-up.
+            _log.Debug(ex,
+                "Active-node check threw while sending heartbeat for site {0}; reporting IsActive=false",
+                _siteId);
+            isActive = false;
+        }
+
+        var heartbeat = new HeartbeatMessage(
+            _siteId,
+            hostname,
+            IsActive: isActive,
+            DateTimeOffset.UtcNow);
+
+        SendToCentral(_centralClient, heartbeat, Self);
+    }
+
+    /// <summary>
+    /// Communication-018: default active-node check used when no override is
+    /// supplied. Mirrors <c>ActiveNodeGate</c> in the Host (and
+    /// <c>ActiveNodeHealthCheck</c>): the node is the active member of the
+    /// site cluster when it is the current cluster leader AND its own
+    /// <see cref="MemberStatus"/> is <see cref="MemberStatus.Up"/>. Any other
+    /// state (still joining, leaving, no leader yet) reports standby —
+    /// safe-by-default, matching the standby case.
+    /// </summary>
+    private bool DefaultIsActiveCheck()
+    {
+        var cluster = Cluster.Get(Context.System);
+        var self = cluster.SelfMember;
+        if (self.Status != MemberStatus.Up)
+            return false;
+
+        var leader = cluster.State.Leader;
+        return leader != null && leader == self.Address;
+    }
+
+    // ── Internal messages ──
+
+    internal record SendHeartbeat;
+}
+
+/// <summary>
+/// Command to register a ClusterClient for communicating with the central cluster.
+/// Until it is received, SiteCommunicationActor sends no heartbeats or health reports and
+/// answers the sender with a failure/negative reply for messages it would forward to central.
+/// </summary>
+/// <param name="Client">The ClusterClient actor used for all subsequent sends to central.</param>
+public record RegisterCentralClient(IActorRef Client);
+
+/// <summary>
+/// Command to register a local actor as a handler for a specific message pattern.
+/// A later registration for the same <see cref="LocalHandlerType"/> replaces the earlier one.
+/// </summary>
+/// <param name="HandlerType">The message pattern the handler serves.</param>
+/// <param name="Handler">The local actor that inbound messages of that pattern are forwarded to.</param>
+public record RegisterLocalHandler(LocalHandlerType HandlerType, IActorRef Handler);
+
+/// <summary>
+/// Identifies which routing slot of <see cref="SiteCommunicationActor"/> a
+/// <see cref="RegisterLocalHandler"/> command fills.
+/// </summary>
+public enum LocalHandlerType
+{
+    /// <summary>Handler for event log queries (EventLogQueryRequest).</summary>
+    EventLog,
+    /// <summary>
+    /// Handler for parked-message queries, retries and discards, including the
+    /// RetryParkedOperation / DiscardParkedOperation relay commands.
+    /// </summary>
+    ParkedMessages,
+    /// <summary>Handler for integration calls (IntegrationCallRequest).</summary>
+    Integration,
+    /// <summary>Handler for artifact deployment (DeployArtifactsCommand).</summary>
+    Artifacts
+}
@@ -0,0 +1,108 @@
+using System.Threading.Channels;
+using Akka.Actor;
+using Akka.Event;
+using Google.Protobuf.WellKnownTypes;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Streaming;
+using ZB.MOM.WW.ScadaBridge.Commons.Types;
+using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
+using AlarmState = ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AlarmState;
+using AlarmLevel = ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AlarmLevel;
+
+namespace ZB.MOM.WW.ScadaBridge.Communication.Actors;
+
+/// <summary>
+/// Relay actor that turns Akka domain events (<see cref="AttributeValueChanged"/>,
+/// <see cref="AlarmStateChanged"/>) into protobuf <see cref="SiteStreamEvent"/> messages
+/// and hands them to a <see cref="ChannelWriter{T}"/>. The gRPC server method reads
+/// from the reader side of that channel.
+/// </summary>
+public class StreamRelayActor : ReceiveActor
+{
+    private readonly ILoggingAdapter _log = Context.GetLogger();
+    private readonly string _correlationId;
+    private readonly ChannelWriter<SiteStreamEvent> _channelWriter;
+
+    /// <summary>
+    /// Creates a relay bound to one gRPC stream correlation and registers the
+    /// handlers for the two supported domain events.
+    /// </summary>
+    /// <param name="correlationId">Correlation id stamped on every relayed <see cref="SiteStreamEvent"/>.</param>
+    /// <param name="channelWriter">Channel writer that receives the converted events.</param>
+    public StreamRelayActor(string correlationId, ChannelWriter<SiteStreamEvent> channelWriter)
+    {
+        _correlationId = correlationId;
+        _channelWriter = channelWriter;
+
+        Receive<AttributeValueChanged>(changed => Publish(ToStreamEvent(changed)));
+        Receive<AlarmStateChanged>(changed => Publish(ToStreamEvent(changed)));
+    }
+
+    /// <summary>Builds the protobuf event for an attribute value change.</summary>
+    /// <param name="changed">The domain event to convert.</param>
+    /// <returns>A <see cref="SiteStreamEvent"/> carrying an <see cref="AttributeValueUpdate"/> payload.</returns>
+    private SiteStreamEvent ToStreamEvent(AttributeValueChanged changed)
+    {
+        // Envelope first, payload second: the same assignment order a single
+        // nested object initializer would produce.
+        var envelope = new SiteStreamEvent { CorrelationId = _correlationId };
+        envelope.AttributeChanged = new AttributeValueUpdate
+        {
+            InstanceUniqueName = changed.InstanceUniqueName,
+            AttributePath = changed.AttributePath,
+            AttributeName = changed.AttributeName,
+            Value = ValueFormatter.FormatDisplayValue(changed.Value),
+            Quality = ToProtoQuality(changed.Quality),
+            Timestamp = Timestamp.FromDateTimeOffset(changed.Timestamp)
+        };
+        return envelope;
+    }
+
+    /// <summary>Builds the protobuf event for an alarm state change.</summary>
+    /// <param name="changed">The domain event to convert.</param>
+    /// <returns>A <see cref="SiteStreamEvent"/> carrying an <see cref="AlarmStateUpdate"/> payload.</returns>
+    private SiteStreamEvent ToStreamEvent(AlarmStateChanged changed)
+    {
+        var envelope = new SiteStreamEvent { CorrelationId = _correlationId };
+        envelope.AlarmChanged = new AlarmStateUpdate
+        {
+            InstanceUniqueName = changed.InstanceUniqueName,
+            AlarmName = changed.AlarmName,
+            State = ToProtoAlarmState(changed.State),
+            Priority = changed.Priority,
+            Timestamp = Timestamp.FromDateTimeOffset(changed.Timestamp),
+            Level = ToProtoAlarmLevel(changed.Level),
+            // A missing alarm message is relayed as an empty string.
+            Message = changed.Message ?? string.Empty
+        };
+        return envelope;
+    }
+
+    /// <summary>
+    /// Attempts a non-blocking write of the event to the channel. When the write is
+    /// rejected the event is discarded and a warning is logged.
+    /// </summary>
+    /// <param name="streamEvent">The converted event to hand to the channel.</param>
+    private void Publish(SiteStreamEvent streamEvent)
+    {
+        if (_channelWriter.TryWrite(streamEvent))
+        {
+            return;
+        }
+
+        _log.Warning("Channel full, dropping event for correlation {0}", _correlationId);
+    }
+
+    /// <summary>Maps a textual quality to the protobuf enum; unrecognised text maps to <c>Unspecified</c>.</summary>
+    private static Quality ToProtoQuality(string text) => text switch
+    {
+        "Good" => Quality.Good,
+        "Uncertain" => Quality.Uncertain,
+        "Bad" => Quality.Bad,
+        _ => Quality.Unspecified
+    };
+
+    /// <summary>Maps the domain alarm state to the protobuf enum; any other state maps to <c>AlarmStateUnspecified</c>.</summary>
+    private static AlarmStateEnum ToProtoAlarmState(AlarmState domainState) => domainState switch
+    {
+        AlarmState.Normal => AlarmStateEnum.AlarmStateNormal,
+        AlarmState.Active => AlarmStateEnum.AlarmStateActive,
+        _ => AlarmStateEnum.AlarmStateUnspecified
+    };
+
+    /// <summary>Maps the domain alarm level to the protobuf enum; any other level maps to <c>AlarmLevelNone</c>.</summary>
+    private static AlarmLevelEnum ToProtoAlarmLevel(AlarmLevel domainLevel) => domainLevel switch
+    {
+        AlarmLevel.Low => AlarmLevelEnum.AlarmLevelLow,
+        AlarmLevel.LowLow => AlarmLevelEnum.AlarmLevelLowLow,
+        AlarmLevel.High => AlarmLevelEnum.AlarmLevelHigh,
+        AlarmLevel.HighHigh => AlarmLevelEnum.AlarmLevelHighHigh,
+        _ => AlarmLevelEnum.AlarmLevelNone
+    };
+}