feat(deployment-manager): resolve DeploymentManager-006 — query site deployment state before redeploy and reconcile
Adds DeploymentStateQuery request/response contracts (Commons), a site-side handler (SiteRuntime), a CommunicationService query method (Communication), and reconciliation in DeploymentService. When a prior record is InProgress or Failed due to a timeout, the site is queried first: if it already holds the target revision hash, the record is marked Success without re-sending the deploy; if the query fails, the flow falls through to a normal deploy (site-side stale-rejection is the safety net).
This commit is contained in:
@@ -43,6 +43,14 @@ public class DeploymentService
|
||||
private readonly DeploymentManagerOptions _options;
|
||||
private readonly ILogger<DeploymentService> _logger;
|
||||
|
||||
/// <summary>
|
||||
/// Prefix written to <see cref="DeploymentRecord.ErrorMessage"/> when a
|
||||
/// deployment fails because the site command timed out or was cancelled.
|
||||
/// Used by the query-before-redeploy trigger (DeploymentManager-006) to tell
|
||||
/// a timeout-induced failure apart from other deployment errors.
|
||||
/// </summary>
|
||||
private const string TimeoutFailurePrefix = "Communication failure:";
|
||||
|
||||
public DeploymentService(
|
||||
IDeploymentManagerRepository repository,
|
||||
ISiteRepository siteRepository,
|
||||
@@ -118,6 +126,18 @@ public class DeploymentService
|
||||
return Result<DeploymentRecord>.Failure($"Pre-deployment validation failed: {errors}");
|
||||
}
|
||||
|
||||
// DeploymentManager-006: query-the-site-before-redeploy idempotency.
|
||||
// If a prior deployment for this instance is stuck InProgress or Failed
|
||||
// due to a timeout, the site may have actually applied the config. Query
|
||||
// the site for its currently-applied revision before re-sending so a
|
||||
// duplicate deployment is not produced (design: "Deployment Identity &
|
||||
// Idempotency"). A clean prior Success or a fresh first-time deploy
|
||||
// skips this extra round-trip.
|
||||
var reconciled = await TryReconcileWithSiteAsync(
|
||||
instance, revisionHash, cancellationToken);
|
||||
if (reconciled != null)
|
||||
return Result<DeploymentRecord>.Success(reconciled);
|
||||
|
||||
// Serialize for transmission
|
||||
var configJson = JsonSerializer.Serialize(flattenedConfig);
|
||||
|
||||
@@ -199,7 +219,7 @@ public class DeploymentService
|
||||
|
||||
record.Status = DeploymentStatus.Failed;
|
||||
record.ErrorMessage = isTimeout
|
||||
? $"Communication failure: {ex.Message}"
|
||||
? $"{TimeoutFailurePrefix} {ex.Message}"
|
||||
: $"Deployment error: {ex.Message}";
|
||||
record.CompletedAt = DateTimeOffset.UtcNow;
|
||||
|
||||
@@ -401,6 +421,105 @@ public class DeploymentService
|
||||
return await _repository.GetDeploymentByDeploymentIdAsync(deploymentId, cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
/// DeploymentManager-006: reconciles a possibly stale prior deployment record
/// with the site's actual state before a re-deploy is sent.
///
/// The current deployment record for the instance is loaded first. The site
/// is contacted only when <see cref="ShouldQuerySiteBeforeRedeploy"/> accepts
/// that record; when there is no record, or the record does not qualify,
/// <c>null</c> is returned without any site round-trip.
///
/// When the site reports that it is deployed and its applied revision hash
/// equals <paramref name="targetRevisionHash"/> (ordinal comparison), the
/// prior record is updated to <see cref="DeploymentStatus.Success"/>, its
/// error message is cleared, its completion time is set to the current UTC
/// time, the change is saved, and a "DeployReconciled" audit entry is
/// written. The updated record is returned and the caller must not re-send
/// the deploy.
///
/// Any exception raised while resolving the site identifier or querying the
/// site is logged as a warning and yields <c>null</c>, so the caller carries
/// on with a normal deploy. Exceptions from the record update, save, or
/// audit calls are not caught here.
/// </summary>
/// <param name="instance">Instance that is about to be (re-)deployed.</param>
/// <param name="targetRevisionHash">Revision hash the caller intends to deploy.</param>
/// <param name="cancellationToken">Token passed to every awaited call.</param>
/// <returns>
/// The reconciled prior record when the site already holds the target
/// revision; otherwise <c>null</c>.
/// </returns>
private async Task<DeploymentRecord?> TryReconcileWithSiteAsync(
    Instance instance,
    string targetRevisionHash,
    CancellationToken cancellationToken)
{
    var previousRecord = await _repository.GetCurrentDeploymentStatusAsync(instance.Id, cancellationToken);
    if (previousRecord is null)
    {
        return null;
    }

    if (!ShouldQuerySiteBeforeRedeploy(previousRecord))
    {
        return null;
    }

    DeploymentStateQueryResponse siteState;
    try
    {
        var siteIdentifier = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
        var stateQuery = new DeploymentStateQueryRequest(
            Guid.NewGuid().ToString("N"),
            instance.UniqueName,
            DateTimeOffset.UtcNow);

        _logger.LogInformation(
            "Querying site {SiteId} for applied deployment state of instance {Instance} " +
            "before re-deploy (prior record {DeploymentId} is {Status})",
            siteIdentifier,
            instance.UniqueName,
            previousRecord.DeploymentId,
            previousRecord.Status);

        siteState = await _communicationService.QueryDeploymentStateAsync(
            siteIdentifier,
            stateQuery,
            cancellationToken);
    }
    catch (Exception queryError)
    {
        // A failed query must never block the deployment: report it and let
        // the caller continue with the regular deploy path. The site's own
        // stale-rejection of an older revision hash is the safety net.
        _logger.LogWarning(queryError,
            "Site query before re-deploy of instance {Instance} failed; " +
            "proceeding with normal deploy (site-side stale-rejection is the safety net)",
            instance.UniqueName);
        return null;
    }

    var siteHoldsTargetRevision =
        siteState.IsDeployed &&
        string.Equals(siteState.AppliedRevisionHash, targetRevisionHash, StringComparison.Ordinal);

    if (!siteHoldsTargetRevision)
    {
        // Site is not deployed, or holds a different revision: the normal
        // deploy has to run.
        return null;
    }

    // The earlier deployment did reach the site, so repair the stale record
    // rather than sending the same configuration again.
    _logger.LogInformation(
        "Site already has target revision {RevisionHash} for instance {Instance}; " +
        "marking prior deployment record {DeploymentId} Success without re-deploying",
        targetRevisionHash,
        instance.UniqueName,
        previousRecord.DeploymentId);

    previousRecord.Status = DeploymentStatus.Success;
    previousRecord.ErrorMessage = null;
    previousRecord.CompletedAt = DateTimeOffset.UtcNow;
    await _repository.UpdateDeploymentRecordAsync(previousRecord, cancellationToken);
    await _repository.SaveChangesAsync(cancellationToken);

    await _auditService.LogAsync(
        previousRecord.DeployedBy,
        "DeployReconciled",
        "Instance",
        instance.Id.ToString(),
        instance.UniqueName,
        new { previousRecord.DeploymentId, RevisionHash = targetRevisionHash },
        cancellationToken);

    return previousRecord;
}
|
||||
|
||||
/// <summary>
/// DeploymentManager-006: decides whether the site has to be asked for its
/// applied deployment state before a re-deploy is sent.
/// </summary>
/// <param name="prior">Most recent deployment record of the instance.</param>
/// <returns>
/// <c>true</c> when <paramref name="prior"/> is
/// <see cref="DeploymentStatus.InProgress"/>, or is
/// <see cref="DeploymentStatus.Failed"/> with an error message that starts
/// with <see cref="TimeoutFailurePrefix"/> (ordinal comparison); otherwise
/// <c>false</c>.
/// </returns>
private static bool ShouldQuerySiteBeforeRedeploy(DeploymentRecord prior)
{
    if (prior.Status == DeploymentStatus.InProgress)
    {
        return true;
    }

    if (prior.Status != DeploymentStatus.Failed)
    {
        return false;
    }

    // Only failures carrying the timeout marker qualify; a missing error
    // message never does.
    return prior.ErrorMessage?.StartsWith(TimeoutFailurePrefix, StringComparison.Ordinal) == true;
}
|
||||
|
||||
private async Task StoreDeployedSnapshotAsync(
|
||||
int instanceId,
|
||||
string deploymentId,
|
||||
|
||||
Reference in New Issue
Block a user