feat(site): replicate config by id + standby fetch (kills the intra-site frame trap)
This commit is contained in:
@@ -2,6 +2,7 @@ using Akka.Actor;
|
||||
using Akka.Cluster;
|
||||
using Akka.Event;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Deployment;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
|
||||
using ZB.MOM.WW.ScadaBridge.StoreAndForward;
|
||||
@@ -21,6 +22,7 @@ public class SiteReplicationActor : ReceiveActor
|
||||
private readonly SiteStorageService _storage;
|
||||
private readonly StoreAndForwardStorage _sfStorage;
|
||||
private readonly ReplicationService _replicationService;
|
||||
private readonly IDeploymentConfigFetcher? _configFetcher;
|
||||
private readonly string _siteRole;
|
||||
private readonly ILogger<SiteReplicationActor> _logger;
|
||||
private readonly Cluster _cluster;
|
||||
@@ -34,16 +36,24 @@ public class SiteReplicationActor : ReceiveActor
|
||||
/// <param name="replicationService">Service providing replication transport logic.</param>
|
||||
/// <param name="siteRole">Akka cluster role used to identify peer nodes to replicate to.</param>
|
||||
/// <param name="logger">Logger instance.</param>
|
||||
/// <param name="configFetcher">
|
||||
/// Fetches a deployed instance's config JSON from central over HTTP. Used by the
|
||||
/// notify-and-fetch standby apply path (<see cref="HandleApplyConfigDeploy"/>): the peer
|
||||
/// replicates only the deployment id, and the standby fetches the config itself so a large
|
||||
/// config never crosses the intra-site Akka hop. Null on nodes/tests without a fetcher.
|
||||
/// </param>
|
||||
public SiteReplicationActor(
|
||||
SiteStorageService storage,
|
||||
StoreAndForwardStorage sfStorage,
|
||||
ReplicationService replicationService,
|
||||
string siteRole,
|
||||
ILogger<SiteReplicationActor> logger)
|
||||
ILogger<SiteReplicationActor> logger,
|
||||
IDeploymentConfigFetcher? configFetcher = null)
|
||||
{
|
||||
_storage = storage;
|
||||
_sfStorage = sfStorage;
|
||||
_replicationService = replicationService;
|
||||
_configFetcher = configFetcher;
|
||||
_siteRole = siteRole;
|
||||
_logger = logger;
|
||||
_cluster = Cluster.Get(Context.System);
|
||||
@@ -55,7 +65,8 @@ public class SiteReplicationActor : ReceiveActor
|
||||
|
||||
// Outbound — forward to peer
|
||||
Receive<ReplicateConfigDeploy>(msg => SendToPeer(new ApplyConfigDeploy(
|
||||
msg.InstanceName, msg.ConfigJson, msg.DeploymentId, msg.RevisionHash, msg.IsEnabled)));
|
||||
msg.InstanceName, msg.DeploymentId, msg.RevisionHash, msg.IsEnabled,
|
||||
msg.CentralFetchBaseUrl, msg.FetchToken)));
|
||||
Receive<ReplicateConfigRemove>(msg => SendToPeer(new ApplyConfigRemove(msg.InstanceName)));
|
||||
Receive<ReplicateConfigSetEnabled>(msg => SendToPeer(new ApplyConfigSetEnabled(
|
||||
msg.InstanceName, msg.IsEnabled)));
|
||||
@@ -120,7 +131,12 @@ public class SiteReplicationActor : ReceiveActor
|
||||
}
|
||||
}
|
||||
|
||||
private void SendToPeer(object message)
|
||||
/// <summary>
|
||||
/// Forwards a replication message to the tracked peer node's <c>site-replication</c> actor
|
||||
/// (fire-and-forget, dropped when no peer is tracked). <c>protected virtual</c>
|
||||
/// so tests can intercept the peer send without standing up a real two-node cluster.
|
||||
/// </summary>
|
||||
protected virtual void SendToPeer(object message)
|
||||
{
|
||||
if (_peerAddress == null)
|
||||
{
|
||||
@@ -136,14 +152,68 @@ public class SiteReplicationActor : ReceiveActor
|
||||
|
||||
private void HandleApplyConfigDeploy(ApplyConfigDeploy msg)
|
||||
{
|
||||
_logger.LogInformation("Applying replicated config deploy for {Instance}", msg.InstanceName);
|
||||
_storage.StoreDeployedConfigAsync(
|
||||
msg.InstanceName, msg.ConfigJson, msg.DeploymentId, msg.RevisionHash, msg.IsEnabled)
|
||||
.ContinueWith(t =>
|
||||
if (string.IsNullOrEmpty(msg.CentralFetchBaseUrl))
|
||||
{
|
||||
// The still-present direct DeployInstanceCommand wire path (to be retired in Task 14)
|
||||
// replicates with empty coords; there is nothing to fetch. Skip quietly rather
|
||||
// than calling FetchAsync("") and logging an error — T18 reconciliation backstops.
|
||||
_logger.LogDebug(
|
||||
"No fetch coords for {Instance} (deployment {DeploymentId}) — skipping replicated fetch; T18 reconciliation is the backstop",
|
||||
msg.InstanceName, msg.DeploymentId);
|
||||
return;
|
||||
}
|
||||
|
||||
if (_configFetcher is null)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"No config fetcher available; cannot apply replicated config for {Instance} (deployment {DeploymentId}) — reconciliation will backstop",
|
||||
msg.InstanceName, msg.DeploymentId);
|
||||
return;
|
||||
}
|
||||
|
||||
_logger.LogInformation(
|
||||
"Replicating config for {Instance} (deployment {DeploymentId}) — fetching from central",
|
||||
msg.InstanceName, msg.DeploymentId);
|
||||
|
||||
// Notify-and-fetch: the peer sent only the id, so the standby fetches the config
|
||||
// itself (off-thread; best-effort fire-and-forget, matching the no-ack replication
|
||||
// model). The guarded write only overwrites a strictly-older local row. A single
|
||||
// fetch attempt — T18 reconciliation is the durable backstop for a lost fetch.
|
||||
_configFetcher.FetchAsync(msg.CentralFetchBaseUrl, msg.DeploymentId, msg.FetchToken, CancellationToken.None)
|
||||
.ContinueWith(async t =>
|
||||
{
|
||||
if (t.IsFaulted)
|
||||
_logger.LogError(t.Exception, "Failed to apply replicated deploy for {Instance}", msg.InstanceName);
|
||||
});
|
||||
try
|
||||
{
|
||||
if (t.IsCompletedSuccessfully)
|
||||
{
|
||||
await _storage.StoreDeployedConfigIfNewerAsync(
|
||||
msg.InstanceName, t.Result, msg.DeploymentId, msg.RevisionHash, msg.IsEnabled);
|
||||
return;
|
||||
}
|
||||
|
||||
var ex = t.Exception?.GetBaseException();
|
||||
if (ex is DeploymentConfigFetchException { IsSuperseded: true })
|
||||
_logger.LogInformation(
|
||||
"Skip replicated config for {Instance}: superseded/expired (a newer deploy will replicate)",
|
||||
msg.InstanceName);
|
||||
else if (t.IsCanceled)
|
||||
_logger.LogWarning(
|
||||
"Replicated config fetch cancelled for {Instance} (deployment {DeploymentId})",
|
||||
msg.InstanceName, msg.DeploymentId);
|
||||
else
|
||||
_logger.LogError(ex,
|
||||
"Replicated config fetch failed for {Instance} (deployment {DeploymentId})",
|
||||
msg.InstanceName, msg.DeploymentId);
|
||||
}
|
||||
catch (Exception writeEx)
|
||||
{
|
||||
// Guarded-write failure is best-effort; observe + log so nothing faults silently.
|
||||
_logger.LogError(writeEx,
|
||||
"Failed to write replicated config for {Instance} (deployment {DeploymentId})",
|
||||
msg.InstanceName, msg.DeploymentId);
|
||||
}
|
||||
})
|
||||
.Unwrap();
|
||||
}
|
||||
|
||||
private void HandleApplyConfigRemove(ApplyConfigRemove msg)
|
||||
|
||||
Reference in New Issue
Block a user