feat(site): DeploymentManagerActor fetches config then applies (notify-and-fetch)

This commit is contained in:
Joseph Doherty
2026-06-26 13:47:28 -04:00
parent 3955cb4f28
commit 631ce5bfce
3 changed files with 257 additions and 9 deletions
@@ -839,6 +839,12 @@ akka {{
_logger.LogInformation("SiteReplicationActor created and S&F replication handler wired");
// Notify-and-fetch (Task 10): the active singleton fetches a deployment's
// flattened config from central over HTTP when a RefreshDeploymentCommand
// arrives. Resolve the fetcher from the same provider the actor already uses.
var deploymentConfigFetcher =
_serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.SiteRuntime.Deployment.IDeploymentConfigFetcher>();
// Create the Deployment Manager as a cluster singleton
var singletonProps = ClusterSingletonManager.Props(
singletonProps: Props.Create(() => new DeploymentManagerActor(
@@ -851,7 +857,9 @@ akka {{
dclManager,
replicationActor,
siteHealthCollector,
_serviceProvider)),
_serviceProvider,
null,
deploymentConfigFetcher)),
terminationMessage: PoisonPill.Instance,
settings: ClusterSingletonManagerSettings.Create(_actorSystem!)
.WithRole(siteRole)
@@ -13,6 +13,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Messages.ScriptExecution;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
using ZB.MOM.WW.ScadaBridge.SiteEventLogging;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Deployment;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts;
@@ -50,6 +51,12 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
private readonly IActorRef? _replicationActor;
private readonly ISiteHealthCollector? _healthCollector;
private readonly IServiceProvider? _serviceProvider;
/// <summary>
/// Notify-and-fetch (Task 10): fetches a deployment's flattened config from central
/// over HTTP when a <see cref="RefreshDeploymentCommand"/> arrives. Optional — null on
/// nodes/tests that never receive a refresh; the active site path supplies it via DI.
/// </summary>
private readonly IDeploymentConfigFetcher? _configFetcher;
private readonly Dictionary<string, IActorRef> _instanceActors = new();
/// <summary>
/// Tracks Instance Actors that are terminating as part of a redeployment, keyed by
@@ -97,6 +104,11 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
/// <param name="healthCollector">Optional site health collector.</param>
/// <param name="serviceProvider">Optional service provider for resolving per-instance services.</param>
/// <param name="loggerFactory">Optional logger factory for creating Instance Actor loggers.</param>
/// <param name="configFetcher">
/// Optional notify-and-fetch config fetcher (Task 10). Required for the
/// <see cref="RefreshDeploymentCommand"/> path; null on nodes/tests that never
/// receive a refresh.
/// </param>
public DeploymentManagerActor(
SiteStorageService storage,
ScriptCompilationService compilationService,
@@ -108,7 +120,8 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
IActorRef? replicationActor = null,
ISiteHealthCollector? healthCollector = null,
IServiceProvider? serviceProvider = null,
ILoggerFactory? loggerFactory = null)
ILoggerFactory? loggerFactory = null,
IDeploymentConfigFetcher? configFetcher = null)
{
_storage = storage;
_compilationService = compilationService;
@@ -119,6 +132,7 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
_replicationActor = replicationActor;
_healthCollector = healthCollector;
_serviceProvider = serviceProvider;
_configFetcher = configFetcher;
_logger = logger;
// SiteRuntime-015: reuse a single logger factory for all Instance Actors.
// Prefer an explicitly injected factory, fall back to one resolved from
@@ -129,11 +143,19 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
?? Microsoft.Extensions.Logging.Abstractions.NullLoggerFactory.Instance;
// Lifecycle commands
Receive<DeployInstanceCommand>(HandleDeploy);
Receive<DeployInstanceCommand>(cmd => HandleDeploy(cmd, Sender));
Receive<DisableInstanceCommand>(HandleDisable);
Receive<EnableInstanceCommand>(HandleEnable);
Receive<DeleteInstanceCommand>(HandleDelete);
// Notify-and-fetch (Task 10): central sends a small RefreshDeploymentCommand;
// the active singleton fetches the flattened config over HTTP, then reuses the
// existing apply path. The two internal results carry the fetched config (or the
// fetch error) back onto the actor thread along with the captured original sender.
Receive<RefreshDeploymentCommand>(HandleRefreshDeployment);
Receive<RefreshFetched>(HandleRefreshFetched);
Receive<RefreshFetchFailed>(HandleRefreshFetchFailed);
// DeploymentManager-006: query-the-site-before-redeploy idempotency.
// Central asks for the instance's currently-applied deployment identity
// before re-sending a deployment whose prior record is stuck InProgress
@@ -356,7 +378,15 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
/// terminating child is watched and the in-flight command is buffered until the
/// <see cref="Terminated"/> signal arrives.
/// </summary>
private void HandleDeploy(DeployInstanceCommand command)
/// <param name="replyTo">
/// The actor to reply to with the eventual <see cref="DeploymentStatusResponse"/>.
/// Passed explicitly (rather than read from <see cref="ActorBase.Sender"/>) so the
/// notify-and-fetch path (<see cref="HandleRefreshFetched"/>) can supply the ORIGINAL
/// central sender after the async config fetch, where <c>Sender</c> is no longer valid.
/// The redeploy-buffer path carries it on <see cref="PendingRedeploy"/> so the buffered
/// apply still replies to the right actor.
/// </param>
private void HandleDeploy(DeployInstanceCommand command, IActorRef replyTo)
{
var instanceName = command.InstanceUniqueName;
_logger.LogInformation(
@@ -369,7 +399,7 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
if (_instanceActors.TryGetValue(instanceName, out var existing))
{
_instanceActors.Remove(instanceName);
_pendingRedeploys[existing] = new PendingRedeploy(command, Sender);
_pendingRedeploys[existing] = new PendingRedeploy(command, replyTo);
_terminatingActorsByName[instanceName] = existing;
Context.Watch(existing);
Context.Stop(existing);
@@ -396,12 +426,12 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
$"superseded by newer deployment {command.DeploymentId} before predecessor finished terminating",
DateTimeOffset.UtcNow));
}
_pendingRedeploys[terminatingRef] = new PendingRedeploy(command, Sender);
_pendingRedeploys[terminatingRef] = new PendingRedeploy(command, replyTo);
return;
}
// Fresh deployment — no existing actor to replace.
ApplyDeployment(command, Sender, isRedeploy: false);
ApplyDeployment(command, replyTo, isRedeploy: false);
}
/// <summary>
@@ -423,6 +453,72 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
ApplyDeployment(pending.Command, pending.OriginalSender, isRedeploy: true);
}
/// <summary>
/// Notify-and-fetch (Task 10): handles a small central→site
/// <see cref="RefreshDeploymentCommand"/>. Fetches the deployment's flattened config
/// from central via <see cref="IDeploymentConfigFetcher"/>, then pipes the outcome back
/// to self as <see cref="RefreshFetched"/> or <see cref="RefreshFetchFailed"/> so the
/// apply (or failure report) runs as an ordinary message with the ORIGINAL sender
/// preserved. The eventual reply is a <see cref="DeploymentStatusResponse"/>.
/// </summary>
/// <param name="cmd">
/// The refresh command; supplies the fetch base URL, deployment id and fetch token, and
/// is carried on the piped result so the follow-up handler can rebuild the deploy command.
/// </param>
/// <remarks>
/// Every exit from this handler produces a reply or a piped follow-up message: a missing
/// fetcher and a fetch call that throws before returning a task are both reported as
/// <see cref="DeploymentStatus.Failed"/> instead of faulting the actor.
/// </remarks>
private void HandleRefreshDeployment(RefreshDeploymentCommand cmd)
{
    // Capture the sender BEFORE the async continuation: Akka's Sender is only valid
    // during synchronous message handling and is no longer the original sender once
    // the ContinueWith/PipeTo continuation runs.
    var replyTo = Sender;

    if (_configFetcher is null)
    {
        replyTo.Tell(new DeploymentStatusResponse(
            cmd.DeploymentId, cmd.InstanceUniqueName, DeploymentStatus.Failed,
            "Deployment config fetcher not available on this node.", DateTimeOffset.UtcNow));
        return;
    }

    _logger.LogInformation(
        "Fetching config for deployment {DeploymentId} instance {Instance} (notify-and-fetch)",
        cmd.DeploymentId, cmd.InstanceUniqueName);

    try
    {
        // CancellationToken.None: the fetch is bounded by HttpClient.Timeout. On a singleton
        // handover mid-fetch the PipeTo lands in dead letters and the central Ask times out
        // (then reconciles) — acceptable, rare.
        _configFetcher.FetchAsync(cmd.CentralFetchBaseUrl, cmd.DeploymentId, cmd.FetchToken, CancellationToken.None)
            .ContinueWith(
                t => t.IsCompletedSuccessfully
                    ? (object)new RefreshFetched(cmd, t.Result, replyTo)
                    : new RefreshFetchFailed(cmd, t.Exception?.GetBaseException().Message ?? "fetch failed", replyTo),
                CancellationToken.None,
                // The continuation only wraps the outcome in a message, so it is cheap enough
                // to run inline; the scheduler is pinned explicitly so it never depends on
                // whatever TaskScheduler.Current happens to be at the call site (CA2008).
                TaskContinuationOptions.ExecuteSynchronously,
                TaskScheduler.Default)
            .PipeTo(Self);
    }
    catch (Exception ex)
    {
        // The fetcher threw before handing back a task (e.g. argument validation in a
        // non-async implementation). Report it through the same failure path as an
        // asynchronous fetch error so the sender still receives a Failed status and
        // this actor does not fault.
        HandleRefreshFetchFailed(
            new RefreshFetchFailed(cmd, ex.GetBaseException().Message, replyTo));
    }
}
/// <summary>
/// Notify-and-fetch (Task 10): success continuation of the config fetch. Rebuilds the
/// in-process <see cref="DeployInstanceCommand"/> from the originating refresh command
/// plus the fetched config JSON and passes it to <see cref="HandleDeploy"/>, together
/// with the reply target that was captured when the refresh arrived.
/// </summary>
/// <param name="msg">
/// The piped-back fetch result: the originating command, the fetched config JSON and
/// the actor that should receive the deployment status reply.
/// </param>
private void HandleRefreshFetched(RefreshFetched msg)
{
    var (refresh, configJson, replyTo) = msg;

    HandleDeploy(
        new DeployInstanceCommand(
            refresh.DeploymentId,
            refresh.InstanceUniqueName,
            refresh.RevisionHash,
            configJson,
            refresh.DeployedBy,
            refresh.Timestamp),
        replyTo);
}
/// <summary>
/// Notify-and-fetch (Task 10): failure continuation of the config fetch. Logs the error
/// and tells the captured reply target a <see cref="DeploymentStatus.Failed"/>
/// <see cref="DeploymentStatusResponse"/>. No deployment is applied on this path.
/// </summary>
/// <param name="msg">
/// The piped-back failure: the originating command, the error text and the actor that
/// should receive the failure reply.
/// </param>
private void HandleRefreshFetchFailed(RefreshFetchFailed msg)
{
    var (refresh, error, replyTo) = msg;

    _logger.LogError(
        "Config fetch failed for deployment {DeploymentId} instance {Instance}: {Error}",
        refresh.DeploymentId, refresh.InstanceUniqueName, error);

    // Built after logging so the response timestamp is taken at reply time.
    var failure = new DeploymentStatusResponse(
        refresh.DeploymentId,
        refresh.InstanceUniqueName,
        DeploymentStatus.Failed,
        $"Communication failure: {error}",
        DateTimeOffset.UtcNow);

    replyTo.Tell(failure);
}
/// <summary>
/// Creates the Instance Actor, persists the config, and replies to the deployer.
/// A redeployment is an update of an existing instance, so the deployed-instance
@@ -1590,6 +1686,19 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
/// </summary>
internal record PendingRedeploy(DeployInstanceCommand Command, IActorRef OriginalSender);
/// <summary>
/// Notify-and-fetch (Task 10): internal message piped back to self once the
/// deployment's flattened config has been fetched from central.
/// </summary>
/// <param name="Cmd">The refresh command that triggered the fetch.</param>
/// <param name="ConfigJson">The fetched flattened config.</param>
/// <param name="ReplyTo">The original sender of the refresh command, to be replied to by the apply path.</param>
private sealed record RefreshFetched(
    RefreshDeploymentCommand Cmd,
    string ConfigJson,
    IActorRef ReplyTo);
/// <summary>
/// Notify-and-fetch (Task 10): internal message piped back to self when the config
/// fetch did not complete successfully.
/// </summary>
/// <param name="Cmd">The refresh command that triggered the fetch.</param>
/// <param name="Error">Human-readable description of the fetch failure.</param>
/// <param name="ReplyTo">The original sender of the refresh command, to be told the Failed status.</param>
private sealed record RefreshFetchFailed(
    RefreshDeploymentCommand Cmd,
    string Error,
    IActorRef ReplyTo);
/// <summary>
/// SiteRuntime-021: internal message dispatched from
/// <see cref="HandleDeployArtifacts"/>'s off-thread persistence task back