feat(site): replicate config by id + standby fetch (kills the intra-site frame trap)

This commit is contained in:
Joseph Doherty
2026-06-26 14:13:58 -04:00
parent 631ce5bfce
commit 5c2db9fe70
6 changed files with 349 additions and 26 deletions
@@ -2,6 +2,7 @@ using Akka.Actor;
using Akka.Cluster;
using Akka.Event;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Deployment;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
using ZB.MOM.WW.ScadaBridge.StoreAndForward;
@@ -21,6 +22,7 @@ public class SiteReplicationActor : ReceiveActor
private readonly SiteStorageService _storage;
private readonly StoreAndForwardStorage _sfStorage;
private readonly ReplicationService _replicationService;
private readonly IDeploymentConfigFetcher? _configFetcher;
private readonly string _siteRole;
private readonly ILogger<SiteReplicationActor> _logger;
private readonly Cluster _cluster;
@@ -34,16 +36,24 @@ public class SiteReplicationActor : ReceiveActor
/// <param name="replicationService">Service providing replication transport logic.</param>
/// <param name="siteRole">Akka cluster role used to identify peer nodes to replicate to.</param>
/// <param name="logger">Logger instance.</param>
/// <param name="configFetcher">
/// Fetches a deployed instance's config JSON from central over HTTP. Used by the
/// notify-and-fetch standby apply path (<see cref="HandleApplyConfigDeploy"/>): the peer
/// replicates only the deployment id, and the standby fetches the config itself so a large
/// config never crosses the intra-site Akka hop. Null on nodes/tests without a fetcher.
/// </param>
public SiteReplicationActor(
SiteStorageService storage,
StoreAndForwardStorage sfStorage,
ReplicationService replicationService,
string siteRole,
ILogger<SiteReplicationActor> logger)
ILogger<SiteReplicationActor> logger,
IDeploymentConfigFetcher? configFetcher = null)
{
_storage = storage;
_sfStorage = sfStorage;
_replicationService = replicationService;
_configFetcher = configFetcher;
_siteRole = siteRole;
_logger = logger;
_cluster = Cluster.Get(Context.System);
@@ -55,7 +65,8 @@ public class SiteReplicationActor : ReceiveActor
// Outbound — forward to peer
Receive<ReplicateConfigDeploy>(msg => SendToPeer(new ApplyConfigDeploy(
msg.InstanceName, msg.ConfigJson, msg.DeploymentId, msg.RevisionHash, msg.IsEnabled)));
msg.InstanceName, msg.DeploymentId, msg.RevisionHash, msg.IsEnabled,
msg.CentralFetchBaseUrl, msg.FetchToken)));
Receive<ReplicateConfigRemove>(msg => SendToPeer(new ApplyConfigRemove(msg.InstanceName)));
Receive<ReplicateConfigSetEnabled>(msg => SendToPeer(new ApplyConfigSetEnabled(
msg.InstanceName, msg.IsEnabled)));
@@ -120,7 +131,12 @@ public class SiteReplicationActor : ReceiveActor
}
}
private void SendToPeer(object message)
/// <summary>
/// Forwards a replication message to the tracked peer node's <c>site-replication</c> actor
/// (fire-and-forget, dropped when no peer is tracked). Declared <c>protected virtual</c>
/// so tests can intercept the peer send without standing up a real two-node cluster.
/// </summary>
protected virtual void SendToPeer(object message)
{
if (_peerAddress == null)
{
@@ -136,14 +152,68 @@ public class SiteReplicationActor : ReceiveActor
/// <summary>
/// Handles a replicated deploy notification received from the peer node. The message
/// carries only the deployment id and fetch coordinates, so this node fetches the config
/// through <c>_configFetcher</c> and persists it with
/// <c>StoreDeployedConfigIfNewerAsync</c>.
/// </summary>
/// <remarks>
/// Best-effort and fire-and-forget: the handler returns without waiting for the fetch,
/// makes a single attempt, and logs every failure instead of propagating it.
/// </remarks>
/// <param name="msg">The replicated deploy notification to apply.</param>
private void HandleApplyConfigDeploy(ApplyConfigDeploy msg)
{
    _logger.LogInformation("Applying replicated config deploy for {Instance}", msg.InstanceName);

    if (string.IsNullOrEmpty(msg.CentralFetchBaseUrl))
    {
        // The still-present direct DeployInstanceCommand wire path (retired in Task 14)
        // replicates with empty coords; there is nothing to fetch. Skip quietly rather
        // than calling FetchAsync("") and logging an error — T18 reconciliation backstops.
        _logger.LogDebug(
            "No fetch coords for {Instance} (deployment {DeploymentId}) — skipping replicated fetch; T18 reconciliation is the backstop",
            msg.InstanceName, msg.DeploymentId);
        return;
    }

    if (_configFetcher is null)
    {
        _logger.LogWarning(
            "No config fetcher available; cannot apply replicated config for {Instance} (deployment {DeploymentId}) — reconciliation will backstop",
            msg.InstanceName, msg.DeploymentId);
        return;
    }

    _logger.LogInformation(
        "Replicating config for {Instance} (deployment {DeploymentId}) — fetching from central",
        msg.InstanceName, msg.DeploymentId);

    // Notify-and-fetch: the peer sent only the id, so the standby fetches the config
    // itself (best-effort fire-and-forget, matching the no-ack replication model).
    // The helper observes and logs every failure, so the discarded task never faults.
    _ = FetchAndStoreReplicatedConfigAsync(_configFetcher, msg);
}

/// <summary>
/// Fetches the config for <paramref name="msg"/> and writes it with the guarded
/// "if newer" store call. Makes one fetch attempt and never throws: fetch and write
/// failures are logged at a level matching their cause.
/// </summary>
/// <param name="fetcher">The non-null fetcher used to retrieve the config.</param>
/// <param name="msg">The replicated deploy notification being applied.</param>
private async Task FetchAndStoreReplicatedConfigAsync(
    IDeploymentConfigFetcher fetcher, ApplyConfigDeploy msg)
{
    try
    {
        // ConfigureAwait(false): nothing after the await touches actor state, so there
        // is no reason to resume on whatever context the message handler ran under.
        var configJson = await fetcher
            .FetchAsync(msg.CentralFetchBaseUrl, msg.DeploymentId, msg.FetchToken, CancellationToken.None)
            .ConfigureAwait(false);

        try
        {
            await _storage.StoreDeployedConfigIfNewerAsync(
                    msg.InstanceName, configJson, msg.DeploymentId, msg.RevisionHash, msg.IsEnabled)
                .ConfigureAwait(false);
        }
        catch (Exception writeEx)
        {
            // Guarded-write failure is best-effort; observe + log so nothing faults silently.
            _logger.LogError(writeEx,
                "Failed to write replicated config for {Instance} (deployment {DeploymentId})",
                msg.InstanceName, msg.DeploymentId);
        }
    }
    catch (DeploymentConfigFetchException ex) when (ex.IsSuperseded)
    {
        _logger.LogInformation(
            "Skip replicated config for {Instance}: superseded/expired (a newer deploy will replicate)",
            msg.InstanceName);
    }
    catch (OperationCanceledException)
    {
        _logger.LogWarning(
            "Replicated config fetch cancelled for {Instance} (deployment {DeploymentId})",
            msg.InstanceName, msg.DeploymentId);
    }
    catch (Exception ex)
    {
        // Also covers a synchronous throw from FetchAsync, which would otherwise escape
        // the message handler instead of being treated as a lost best-effort fetch.
        _logger.LogError(ex,
            "Replicated config fetch failed for {Instance} (deployment {DeploymentId})",
            msg.InstanceName, msg.DeploymentId);
    }
}
private void HandleApplyConfigRemove(ApplyConfigRemove msg)