feat(deploy): stage pending config + send RefreshDeploymentCommand (notify-and-fetch)

This commit is contained in:
Joseph Doherty
2026-06-26 12:56:58 -04:00
parent 25f768f379
commit 10f752df02
4 changed files with 243 additions and 12 deletions
@@ -9,6 +9,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Lifecycle;
using ZB.MOM.WW.ScadaBridge.Commons.Observability;
using ZB.MOM.WW.ScadaBridge.Commons.Types;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Deployment;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening;
using ZB.MOM.WW.ScadaBridge.Communication;
@@ -25,7 +26,7 @@ namespace ZB.MOM.WW.ScadaBridge.DeploymentManager;
/// 3. Flatten configuration via TemplateEngine (captures template state at time of flatten -- WP-16)
/// 4. Validate flattened configuration
/// 5. Compute revision hash and diff
/// 6. Send DeployInstanceCommand to site via CommunicationService
/// 6. Stage a PendingDeployment row + send RefreshDeploymentCommand (notify-and-fetch; site fetches the config over HTTP)
/// 7. Track deployment status with optimistic concurrency (WP-4)
/// 8. Store deployed config snapshot (WP-8)
/// 9. Audit log all actions
@@ -45,6 +46,7 @@ public class DeploymentService
private readonly RevisionHashService _revisionHashService;
private readonly IDeploymentStatusNotifier _statusNotifier;
private readonly DeploymentManagerOptions _options;
private readonly CommunicationOptions _commOptions;
private readonly ILogger<DeploymentService> _logger;
/// <summary>
@@ -72,6 +74,13 @@ public class DeploymentService
/// </param>
/// <param name="statusNotifier">Notifier for pushing deployment status changes to the UI.</param>
/// <param name="options">Deployment manager configuration options.</param>
/// <param name="communicationOptions">
/// Central-site communication options. The notify-and-fetch deploy path reads
/// <see cref="CommunicationOptions.CentralFetchBaseUrl"/> (carried in the
/// <c>RefreshDeploymentCommand</c> so sites need no standing config) and
/// <see cref="CommunicationOptions.PendingDeploymentTtl"/> (the staged
/// <c>PendingDeployment</c> row's lifetime).
/// </param>
/// <param name="logger">Logger instance.</param>
public DeploymentService(
IDeploymentManagerRepository repository,
@@ -84,6 +93,7 @@ public class DeploymentService
RevisionHashService revisionHashService,
IDeploymentStatusNotifier statusNotifier,
IOptions<DeploymentManagerOptions> options,
IOptions<CommunicationOptions> communicationOptions,
ILogger<DeploymentService> logger)
{
_repository = repository;
@@ -96,6 +106,7 @@ public class DeploymentService
_revisionHashService = revisionHashService;
_statusNotifier = statusNotifier;
_options = options.Value;
_commOptions = communicationOptions.Value;
_logger = logger;
}
@@ -209,6 +220,16 @@ public class DeploymentService
if (reconciled != null)
return Result<DeploymentRecord>.Success(reconciled);
// Notify-and-fetch: the site fetches the staged config from
// CentralFetchBaseUrl, so a deploy is impossible without it. Fail fast
// here — BEFORE creating an InProgress record or staging a pending row —
// so the operator sees a clear configuration error instead of a
// confusing downstream site-fetch failure (and no InProgress record is
// stranded).
if (string.IsNullOrEmpty(_commOptions.CentralFetchBaseUrl))
return Result<DeploymentRecord>.Failure(
"CentralFetchBaseUrl is not configured — required for deployment (notify-and-fetch).");
// WP-4: Create the deployment record directly in InProgress.
//
// DeploymentManager-022: the previous code wrote the record as Pending,
@@ -236,16 +257,39 @@ public class DeploymentService
try
{
// WP-1: Send to site via CommunicationService
// Notify-and-fetch: instead of shipping the (potentially oversized,
// silently-dropped >128 KB) flattened config inline in a
// DeployInstanceCommand, stage it in a PendingDeployment row and send
// a small RefreshDeploymentCommand. The site fetches the config from
// CentralFetchBaseUrl over HTTP using the per-deployment fetch token.
var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
var command = new DeployInstanceCommand(
deploymentId, instance.UniqueName, revisionHash, configJson, user, DateTimeOffset.UtcNow);
var token = DeploymentFetchToken.Generate();
var stagedAt = DateTimeOffset.UtcNow;
await _repository.AddPendingDeploymentAsync(new PendingDeployment(
deploymentId, instanceId, revisionHash, configJson, token,
stagedAt, stagedAt + _commOptions.PendingDeploymentTtl), cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
var command = new RefreshDeploymentCommand(
deploymentId, instance.UniqueName, revisionHash, user, stagedAt,
_commOptions.CentralFetchBaseUrl, token);
_logger.LogInformation(
"Sending deployment {DeploymentId} for instance {Instance} to site {SiteId}",
"Sending deployment {DeploymentId} for instance {Instance} to site {SiteId} (notify-and-fetch)",
deploymentId, instance.UniqueName, siteId);
var response = await _communicationService.DeployInstanceAsync(siteId, command, cancellationToken);
// Cleanup of the staged PendingDeployment is TTL-based ONLY — the row
// is deliberately NOT deleted on success or in the catch. On a
// central-side Ask timeout the site may already have applied the config
// AND told the standby node to fetch it; deleting the row now would make
// that in-flight standby fetch return 404 and break failover.
// Supersession bounds pending rows to ≤1 per instance and the fetch
// endpoint enforces the TTL, so leaving rows for TTL purge is safe.
// TODO(notify-and-fetch): wire PurgeExpiredPendingDeploymentsAsync
// into a central maintenance cadence (none exists in DeploymentManager
// today; deferred — supersession + endpoint TTL keep this safe).
var response = await _communicationService.RefreshDeploymentAsync(siteId, command, cancellationToken);
// WP-1: Update status based on site response.
record.Status = response.Status;
@@ -320,6 +364,12 @@ public class DeploymentService
try
{
await _repository.UpdateDeploymentRecordAsync(record, CancellationToken.None);
// Note: if the staging SaveChangesAsync above was interrupted, an
// Added PendingDeployment may still be tracked and will be
// committed by this cleanup save. That row is orphaned (no
// RefreshDeploymentCommand was sent, so no site holds its token)
// and is removed by TTL purge / superseded by the next deploy --
// harmless.
await _repository.SaveChangesAsync(CancellationToken.None);
NotifyStatusChange(record);