feat(deployment-manager): resolve DeploymentManager-006 — query site deployment state before redeploy and reconcile

Adds DeploymentStateQuery request/response contracts (Commons), a site-side
handler (SiteRuntime), a CommunicationService query method (Communication), and
reconciliation in DeploymentService: when a prior record is InProgress or
Failed-on-timeout, query the site; if it already holds the target revision hash
mark the record Success without re-sending; on query failure fall through to a
normal deploy (site-side stale-rejection is the safety net).
This commit is contained in:
Joseph Doherty
2026-05-16 20:12:24 -04:00
parent cac8aebe9f
commit bc548e1447
13 changed files with 662 additions and 19 deletions

View File

@@ -0,0 +1,13 @@
namespace ScadaLink.Commons.Messages.Deployment;
/// <summary>
/// Central→site query for the currently-applied deployment state of a single
/// instance. Issued by the Deployment Manager before a re-deploy when a prior
/// deployment record is stuck <c>InProgress</c> or <c>Failed</c> due to a
/// timeout, so the site's actual state can be reconciled against the target
/// revision before re-sending a deployment ("Deployment Identity &amp; Idempotency").
/// </summary>
public record DeploymentStateQueryRequest(
string CorrelationId,
string InstanceUniqueName,
DateTimeOffset Timestamp);

View File

@@ -0,0 +1,15 @@
namespace ScadaLink.Commons.Messages.Deployment;
/// <summary>
/// Site→central response carrying the instance's currently-applied deployment
/// state. If <see cref="IsDeployed"/> is <c>false</c> the instance has no
/// deployed configuration at the site and <see cref="AppliedDeploymentId"/> /
/// <see cref="AppliedRevisionHash"/> are <c>null</c>.
/// </summary>
public record DeploymentStateQueryResponse(
string CorrelationId,
string InstanceUniqueName,
bool IsDeployed,
string? AppliedDeploymentId,
string? AppliedRevisionHash,
DateTimeOffset Timestamp);

View File

@@ -76,6 +76,11 @@ public class SiteCommunicationActor : ReceiveActor, IWithTimers
Receive<EnableInstanceCommand>(msg => _deploymentManagerProxy.Forward(msg));
Receive<DeleteInstanceCommand>(msg => _deploymentManagerProxy.Forward(msg));
// DeploymentManager-006: query-the-site-before-redeploy — forward to
// the Deployment Manager, which owns the deployed-config store and
// answers with the instance's currently-applied deployment identity.
Receive<DeploymentStateQueryRequest>(msg => _deploymentManagerProxy.Forward(msg));
// Pattern 3: Artifact Deployment — forward to artifact handler if registered
Receive<DeployArtifactsCommand>(msg =>
{

View File

@@ -73,6 +73,26 @@ public class CommunicationService
envelope, _options.DeploymentTimeout, cancellationToken);
}
/// <summary>
/// DeploymentManager-006: queries a site for the currently-applied deployment
/// identity of a single instance. Used by the Deployment Manager before a
/// re-deploy to reconcile against the site's actual state. Sent over the
/// existing ClusterClient command/control transport; the Ask times out (no
/// central buffering) if the site is unreachable, and the caller falls
/// through to a normal deploy.
/// </summary>
public async Task<DeploymentStateQueryResponse> QueryDeploymentStateAsync(
string siteId, DeploymentStateQueryRequest request, CancellationToken cancellationToken = default)
{
_logger.LogDebug(
"Sending DeploymentStateQueryRequest to site {SiteId}, instance={Instance}, correlationId={CorrelationId}",
siteId, request.InstanceUniqueName, request.CorrelationId);
var envelope = new SiteEnvelope(siteId, request);
return await GetActor().Ask<DeploymentStateQueryResponse>(
envelope, _options.QueryTimeout, cancellationToken);
}
// ── Pattern 2: Lifecycle ──
public async Task<InstanceLifecycleResponse> DisableInstanceAsync(

View File

@@ -43,6 +43,14 @@ public class DeploymentService
private readonly DeploymentManagerOptions _options;
private readonly ILogger<DeploymentService> _logger;
/// <summary>
/// Prefix written to <see cref="DeploymentRecord.ErrorMessage"/> when a
/// deployment fails because the site command timed out or was cancelled.
/// Used by the query-before-redeploy trigger (DeploymentManager-006) to tell
/// a timeout-induced failure apart from other deployment errors.
/// </summary>
private const string TimeoutFailurePrefix = "Communication failure:";
public DeploymentService(
IDeploymentManagerRepository repository,
ISiteRepository siteRepository,
@@ -118,6 +126,18 @@ public class DeploymentService
return Result<DeploymentRecord>.Failure($"Pre-deployment validation failed: {errors}");
}
// DeploymentManager-006: query-the-site-before-redeploy idempotency.
// If a prior deployment for this instance is stuck InProgress or Failed
// due to a timeout, the site may have actually applied the config. Query
// the site for its currently-applied revision before re-sending so a
// duplicate deployment is not produced (design: "Deployment Identity &
// Idempotency"). A clean prior Success or a fresh first-time deploy
// skips this extra round-trip.
var reconciled = await TryReconcileWithSiteAsync(
instance, revisionHash, cancellationToken);
if (reconciled != null)
return Result<DeploymentRecord>.Success(reconciled);
// Serialize for transmission
var configJson = JsonSerializer.Serialize(flattenedConfig);
@@ -199,7 +219,7 @@ public class DeploymentService
record.Status = DeploymentStatus.Failed;
record.ErrorMessage = isTimeout
? $"Communication failure: {ex.Message}"
? $"{TimeoutFailurePrefix} {ex.Message}"
: $"Deployment error: {ex.Message}";
record.CompletedAt = DateTimeOffset.UtcNow;
@@ -401,6 +421,105 @@ public class DeploymentService
return await _repository.GetDeploymentByDeploymentIdAsync(deploymentId, cancellationToken);
}
/// <summary>
/// DeploymentManager-006: query-the-site-before-redeploy reconciliation.
///
/// The site query is issued ONLY when a prior <see cref="DeploymentRecord"/>
/// for this instance is stuck <see cref="DeploymentStatus.InProgress"/>, or
/// is <see cref="DeploymentStatus.Failed"/> due to a timeout — the only
/// cases where the site may have applied the config without central
/// learning of it. Fresh first-time deploys and redeploys after a clean
/// prior <see cref="DeploymentStatus.Success"/> skip the extra round-trip.
///
/// Reconciliation: if the site already has the TARGET revision hash, the
/// prior record is marked <see cref="DeploymentStatus.Success"/> and
/// returned (the caller must NOT re-send the deploy). Otherwise <c>null</c>
/// is returned and the normal deploy proceeds.
///
/// Query failure: if the site is unreachable or the query times out, this
/// returns <c>null</c> (fall through to a normal deploy) — site-side
/// stale-rejection of an older revision hash is the safety net. The deploy
/// is never aborted on a failed query.
/// </summary>
private async Task<DeploymentRecord?> TryReconcileWithSiteAsync(
Instance instance,
string targetRevisionHash,
CancellationToken cancellationToken)
{
var prior = await _repository.GetCurrentDeploymentStatusAsync(instance.Id, cancellationToken);
if (prior == null || !ShouldQuerySiteBeforeRedeploy(prior))
return null;
DeploymentStateQueryResponse response;
try
{
var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
var query = new DeploymentStateQueryRequest(
Guid.NewGuid().ToString("N"), instance.UniqueName, DateTimeOffset.UtcNow);
_logger.LogInformation(
"Querying site {SiteId} for applied deployment state of instance {Instance} " +
"before re-deploy (prior record {DeploymentId} is {Status})",
siteId, instance.UniqueName, prior.DeploymentId, prior.Status);
response = await _communicationService.QueryDeploymentStateAsync(
siteId, query, cancellationToken);
}
catch (Exception ex)
{
// Query failure (site unreachable / timeout): do NOT abort. Fall
// through to a normal deploy; site-side stale-rejection of an older
// revision hash is the safety net.
_logger.LogWarning(ex,
"Site query before re-deploy of instance {Instance} failed; " +
"proceeding with normal deploy (site-side stale-rejection is the safety net)",
instance.UniqueName);
return null;
}
if (response.IsDeployed &&
string.Equals(response.AppliedRevisionHash, targetRevisionHash, StringComparison.Ordinal))
{
// The site already has the target revision — the prior deployment
// actually succeeded. Reconcile the stale record instead of
// re-sending the deploy.
_logger.LogInformation(
"Site already has target revision {RevisionHash} for instance {Instance}; " +
"marking prior deployment record {DeploymentId} Success without re-deploying",
targetRevisionHash, instance.UniqueName, prior.DeploymentId);
prior.Status = DeploymentStatus.Success;
prior.ErrorMessage = null;
prior.CompletedAt = DateTimeOffset.UtcNow;
await _repository.UpdateDeploymentRecordAsync(prior, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
await _auditService.LogAsync(prior.DeployedBy, "DeployReconciled", "Instance",
instance.Id.ToString(), instance.UniqueName,
new { DeploymentId = prior.DeploymentId, RevisionHash = targetRevisionHash },
cancellationToken);
return prior;
}
// Site does not have the target revision (or is not deployed) — proceed
// with the normal deploy.
return null;
}
/// <summary>
/// DeploymentManager-006: the site is queried before a re-deploy only when a
/// prior record is stuck <see cref="DeploymentStatus.InProgress"/>, or is
/// <see cref="DeploymentStatus.Failed"/> because the site command timed out
/// (detected via the <see cref="TimeoutFailurePrefix"/> error-message
/// marker). All other prior states skip the query.
/// </summary>
private static bool ShouldQuerySiteBeforeRedeploy(DeploymentRecord prior) =>
prior.Status == DeploymentStatus.InProgress
|| (prior.Status == DeploymentStatus.Failed
&& prior.ErrorMessage != null
&& prior.ErrorMessage.StartsWith(TimeoutFailurePrefix, StringComparison.Ordinal));
private async Task StoreDeployedSnapshotAsync(
int instanceId,
string deploymentId,

View File

@@ -78,6 +78,12 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
Receive<EnableInstanceCommand>(HandleEnable);
Receive<DeleteInstanceCommand>(HandleDelete);
// DeploymentManager-006: query-the-site-before-redeploy idempotency.
// Central asks for the instance's currently-applied deployment identity
// before re-sending a deployment whose prior record is stuck InProgress
// or Failed due to a timeout.
Receive<DeploymentStateQueryRequest>(HandleDeploymentStateQuery);
// WP-33: Handle system-wide artifact deployment
Receive<DeployArtifactsCommand>(HandleDeployArtifacts);
@@ -446,6 +452,44 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
_logger.LogInformation("Instance {Instance} deleted", instanceName);
}
/// <summary>
/// DeploymentManager-006: answers a central query for the instance's
/// currently-applied deployment identity. The site's deployed-config store
/// (SQLite) is the authoritative record — it covers both enabled and
/// disabled instances, and survives node restart/failover. If the instance
/// has no stored config, the response reports <c>IsDeployed = false</c> with
/// null identity so central falls through to a normal deploy.
/// </summary>
private void HandleDeploymentStateQuery(DeploymentStateQueryRequest request)
{
var sender = Sender;
var instanceName = request.InstanceUniqueName;
_storage.GetAllDeployedConfigsAsync().ContinueWith(t =>
{
if (!t.IsCompletedSuccessfully)
{
_logger.LogError(
t.Exception?.GetBaseException(),
"Failed to read deployed configs for deployment state query of {Instance}",
instanceName);
// Treat a storage read failure as "unknown" — central falls
// through to a normal deploy and relies on site-side
// stale-rejection as the safety net.
return new DeploymentStateQueryResponse(
request.CorrelationId, instanceName, false, null, null, DateTimeOffset.UtcNow);
}
var config = t.Result.FirstOrDefault(c => c.InstanceUniqueName == instanceName);
return config == null
? new DeploymentStateQueryResponse(
request.CorrelationId, instanceName, false, null, null, DateTimeOffset.UtcNow)
: new DeploymentStateQueryResponse(
request.CorrelationId, instanceName, true,
config.DeploymentId, config.RevisionHash, DateTimeOffset.UtcNow);
}).PipeTo(sender);
}
// ── DCL connection management ──
private readonly HashSet<string> _createdConnections = new();