feat(site): replicate config by id + standby fetch (kills the intra-site frame trap)

2026-06-26 14:13:58 -04:00
parent 631ce5bfce
commit 5c2db9fe70
6 changed files with 349 additions and 26 deletions
@@ -2,6 +2,7 @@ using Akka.Actor;
 using Akka.Cluster;
 using Akka.Event;
 using Microsoft.Extensions.Logging;
+using ZB.MOM.WW.ScadaBridge.SiteRuntime.Deployment;
 using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
 using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
 using ZB.MOM.WW.ScadaBridge.StoreAndForward;
@@ -21,6 +22,7 @@ public class SiteReplicationActor : ReceiveActor
    private readonly SiteStorageService _storage;
    private readonly StoreAndForwardStorage _sfStorage;
    private readonly ReplicationService _replicationService;
+    private readonly IDeploymentConfigFetcher? _configFetcher;
    private readonly string _siteRole;
    private readonly ILogger<SiteReplicationActor> _logger;
    private readonly Cluster _cluster;
@@ -34,16 +36,24 @@ public class SiteReplicationActor : ReceiveActor
    /// <param name="replicationService">Service providing replication transport logic.</param>
    /// <param name="siteRole">Akka cluster role used to identify peer nodes to replicate to.</param>
    /// <param name="logger">Logger instance.</param>
+    /// <param name="configFetcher">
+    /// Fetches a deployed instance's config JSON from central over HTTP. Used by the
+    /// notify-and-fetch standby apply path (<see cref="HandleApplyConfigDeploy"/>): the peer
+    /// replicates only the deployment id, and the standby fetches the config itself so a large
+    /// config never crosses the intra-site Akka hop. Null on nodes/tests without a fetcher.
+    /// </param>
    public SiteReplicationActor(
        SiteStorageService storage,
        StoreAndForwardStorage sfStorage,
        ReplicationService replicationService,
        string siteRole,
-        ILogger<SiteReplicationActor> logger)
+        ILogger<SiteReplicationActor> logger,
+        IDeploymentConfigFetcher? configFetcher = null)
    {
        _storage = storage;
        _sfStorage = sfStorage;
        _replicationService = replicationService;
+        _configFetcher = configFetcher;
        _siteRole = siteRole;
        _logger = logger;
        _cluster = Cluster.Get(Context.System);
@@ -55,7 +65,8 @@ public class SiteReplicationActor : ReceiveActor

        // Outbound — forward to peer
        Receive<ReplicateConfigDeploy>(msg => SendToPeer(new ApplyConfigDeploy(
-            msg.InstanceName, msg.ConfigJson, msg.DeploymentId, msg.RevisionHash, msg.IsEnabled)));
+            msg.InstanceName, msg.DeploymentId, msg.RevisionHash, msg.IsEnabled,
+            msg.CentralFetchBaseUrl, msg.FetchToken)));
        Receive<ReplicateConfigRemove>(msg => SendToPeer(new ApplyConfigRemove(msg.InstanceName)));
        Receive<ReplicateConfigSetEnabled>(msg => SendToPeer(new ApplyConfigSetEnabled(
            msg.InstanceName, msg.IsEnabled)));
@@ -120,7 +131,12 @@ public class SiteReplicationActor : ReceiveActor
        }
    }

-    private void SendToPeer(object message)
+    /// <summary>
+    /// Forwards a replication message to the tracked peer node's <c>site-replication</c> actor
+    /// (fire-and-forget, dropped when no peer is tracked). <c>protected virtual</c>
+    /// so tests can intercept the peer send without standing up a real two-node cluster.
+    /// </summary>
+    protected virtual void SendToPeer(object message)
    {
        if (_peerAddress == null)
        {
@@ -136,14 +152,68 @@ public class SiteReplicationActor : ReceiveActor

    private void HandleApplyConfigDeploy(ApplyConfigDeploy msg)
    {
-        _logger.LogInformation("Applying replicated config deploy for {Instance}", msg.InstanceName);
-        _storage.StoreDeployedConfigAsync(
-            msg.InstanceName, msg.ConfigJson, msg.DeploymentId, msg.RevisionHash, msg.IsEnabled)
-            .ContinueWith(t =>
+        if (string.IsNullOrEmpty(msg.CentralFetchBaseUrl))
+        {
+            // The still-present direct DeployInstanceCommand wire path (to be retired in Task 14)
+            // replicates with empty coords; there is nothing to fetch. Skip quietly rather
+            // than calling FetchAsync("") and logging an error — T18 reconciliation backstops.
+            _logger.LogDebug(
+                "No fetch coords for {Instance} (deployment {DeploymentId}) — skipping replicated fetch; T18 reconciliation is the backstop",
+                msg.InstanceName, msg.DeploymentId);
+            return;
+        }
+
+        if (_configFetcher is null)
+        {
+            _logger.LogWarning(
+                "No config fetcher available; cannot apply replicated config for {Instance} (deployment {DeploymentId}) — reconciliation will backstop",
+                msg.InstanceName, msg.DeploymentId);
+            return;
+        }
+
+        _logger.LogInformation(
+            "Replicating config for {Instance} (deployment {DeploymentId}) — fetching from central",
+            msg.InstanceName, msg.DeploymentId);
+
+        // Notify-and-fetch: the peer sent only the id, so the standby fetches the config
+        // itself (off-thread; best-effort fire-and-forget, matching the no-ack replication
+        // model). The guarded write only overwrites a strictly-older local row. A single
+        // fetch attempt — T18 reconciliation is the durable backstop for a lost fetch.
+        _configFetcher.FetchAsync(msg.CentralFetchBaseUrl, msg.DeploymentId, msg.FetchToken, CancellationToken.None)
+            .ContinueWith(async t =>
            {
-                if (t.IsFaulted)
-                    _logger.LogError(t.Exception, "Failed to apply replicated deploy for {Instance}", msg.InstanceName);
-            });
+                try
+                {
+                    if (t.IsCompletedSuccessfully)
+                    {
+                        await _storage.StoreDeployedConfigIfNewerAsync(
+                            msg.InstanceName, t.Result, msg.DeploymentId, msg.RevisionHash, msg.IsEnabled);
+                        return;
+                    }
+
+                    var ex = t.Exception?.GetBaseException();
+                    if (ex is DeploymentConfigFetchException { IsSuperseded: true })
+                        _logger.LogInformation(
+                            "Skip replicated config for {Instance}: superseded/expired (a newer deploy will replicate)",
+                            msg.InstanceName);
+                    else if (t.IsCanceled)
+                        _logger.LogWarning(
+                            "Replicated config fetch cancelled for {Instance} (deployment {DeploymentId})",
+                            msg.InstanceName, msg.DeploymentId);
+                    else
+                        _logger.LogError(ex,
+                            "Replicated config fetch failed for {Instance} (deployment {DeploymentId})",
+                            msg.InstanceName, msg.DeploymentId);
+                }
+                catch (Exception writeEx)
+                {
+                    // Guarded-write failure is best-effort; observe + log so nothing faults silently.
+                    _logger.LogError(writeEx,
+                        "Failed to write replicated config for {Instance} (deployment {DeploymentId})",
+                        msg.InstanceName, msg.DeploymentId);
+                }
+            })
+            .Unwrap();
    }

    private void HandleApplyConfigRemove(ApplyConfigRemove msg)