fix(deployment-manager): resolve DeploymentManager-015..017 — reconciliation applies post-success side effects, updates RevisionHash, corrected XML doc

This commit is contained in:
Joseph Doherty
2026-05-17 03:18:24 -04:00
parent 14ba5495d1
commit 4fa6f0e774
3 changed files with 193 additions and 34 deletions

View File

@@ -142,6 +142,10 @@ public class DeploymentService
return Result<DeploymentRecord>.Failure($"Pre-deployment validation failed: {errors}");
}
// Serialize for transmission (also the payload stored in the deployed
// snapshot on success / reconciliation).
var configJson = JsonSerializer.Serialize(flattenedConfig);
// DeploymentManager-006: query-the-site-before-redeploy idempotency.
// If a prior deployment for this instance is stuck InProgress or Failed
// due to a timeout, the site may have actually applied the config. Query
@@ -150,13 +154,10 @@ public class DeploymentService
// Idempotency"). A clean prior Success or a fresh first-time deploy
// skips this extra round-trip.
var reconciled = await TryReconcileWithSiteAsync(
instance, revisionHash, cancellationToken);
instance, revisionHash, configJson, cancellationToken);
if (reconciled != null)
return Result<DeploymentRecord>.Success(reconciled);
// Serialize for transmission
var configJson = JsonSerializer.Serialize(flattenedConfig);
// WP-4: Create deployment record with Pending status
var record = new DeploymentRecord(deploymentId, user)
{
@@ -210,25 +211,8 @@ public class DeploymentService
// persistence below is best-effort: a failure here must be
// logged loudly for operator reconciliation but must not flip
// the already-committed Success record back to Failed.
try
{
// WP-4: Update instance state to Enabled on successful deployment
instance.State = InstanceState.Enabled;
await _repository.UpdateInstanceAsync(instance, cancellationToken);
// WP-8: Store deployed config snapshot
await StoreDeployedSnapshotAsync(instanceId, deploymentId, revisionHash, configJson, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
}
catch (Exception postEx)
{
_logger.LogError(postEx,
"Deployment {DeploymentId} for instance {Instance} was applied by the site and " +
"recorded Success, but post-success persistence (instance state / config snapshot) " +
"failed -- central and site state may diverge until reconciled",
deploymentId, instance.UniqueName);
}
await ApplyPostSuccessSideEffectsAsync(
instance, deploymentId, revisionHash, configJson, cancellationToken);
}
// Audit log
@@ -560,7 +544,12 @@ public class DeploymentService
}
/// <summary>
/// WP-2: After failover/timeout, query site for current deployment state before re-deploying.
/// WP-2: Returns the current persisted <see cref="DeploymentRecord"/> for
/// the given deployment ID from the configuration database. This is a pure
/// local DB read — it does not contact the site. The query-the-site-before-
/// redeploy reconciliation (design: "Deployment Identity &amp; Idempotency")
/// lives in <see cref="TryReconcileWithSiteAsync"/>, which
/// <see cref="DeployInstanceAsync"/> invokes on the deploy path.
/// </summary>
public async Task<DeploymentRecord?> GetDeploymentStatusAsync(
string deploymentId,
@@ -580,9 +569,14 @@ public class DeploymentService
/// prior <see cref="DeploymentStatus.Success"/> skip the extra round-trip.
///
/// Reconciliation: if the site already has the TARGET revision hash, the
/// prior record is marked <see cref="DeploymentStatus.Success"/> and
/// returned (the caller must NOT re-send the deploy). Otherwise <c>null</c>
/// is returned and the normal deploy proceeds.
/// prior record is marked <see cref="DeploymentStatus.Success"/> (with its
/// <see cref="DeploymentRecord.RevisionHash"/> corrected to the target —
/// DeploymentManager-016) and returned (the caller must NOT re-send the
/// deploy). The same post-success side effects as the normal deploy path
/// are applied — instance <see cref="InstanceState.Enabled"/> and a stored
/// <see cref="DeployedConfigSnapshot"/> (DeploymentManager-015) — so central
/// and site state do not diverge. Otherwise <c>null</c> is returned and the
/// normal deploy proceeds.
///
/// Query failure: if the site is unreachable or the query times out, this
/// returns <c>null</c> (fall through to a normal deploy) — site-side
@@ -592,6 +586,7 @@ public class DeploymentService
private async Task<DeploymentRecord?> TryReconcileWithSiteAsync(
Instance instance,
string targetRevisionHash,
string configJson,
CancellationToken cancellationToken)
{
var prior = await _repository.GetCurrentDeploymentStatusAsync(instance.Id, cancellationToken);
@@ -639,10 +634,23 @@ public class DeploymentService
prior.Status = DeploymentStatus.Success;
prior.ErrorMessage = null;
prior.CompletedAt = DateTimeOffset.UtcNow;
// DeploymentManager-016: the prior record can legitimately carry a
// different (stale) revision hash than the current target. The site
// confirmed it is running the target revision, so the persisted
// record, the audit entry below, and the site must all agree.
prior.RevisionHash = targetRevisionHash;
await _repository.UpdateDeploymentRecordAsync(prior, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
NotifyStatusChange(prior);
// DeploymentManager-015: a reconciled deployment must perform the
// SAME post-success side effects as the normal deploy path — set
// the instance State to Enabled and store/refresh the deployed
// config snapshot — otherwise the central state machine and the
// deployed-snapshot invariant diverge from what the site is running.
await ApplyPostSuccessSideEffectsAsync(
instance, prior.DeploymentId, targetRevisionHash, configJson, cancellationToken);
await _auditService.LogAsync(prior.DeployedBy, "DeployReconciled", "Instance",
instance.Id.ToString(), instance.UniqueName,
new { DeploymentId = prior.DeploymentId, RevisionHash = targetRevisionHash },
@@ -669,6 +677,47 @@ public class DeploymentService
&& prior.ErrorMessage != null
&& prior.ErrorMessage.StartsWith(TimeoutFailurePrefix, StringComparison.Ordinal));
/// <summary>
/// Post-success side effects shared by the normal deploy path and the
/// DeploymentManager-006 reconciliation path: set the instance
/// <see cref="InstanceState.Enabled"/> (WP-4) and store/refresh the
/// deployed config snapshot (WP-8). Factored into one helper so the two
/// paths cannot drift (DeploymentManager-015).
///
/// Best-effort: the deployment record's terminal <see cref="DeploymentStatus.Success"/>
/// status is already committed by the caller before this runs. A failure
/// here is logged loudly for operator reconciliation but is NOT propagated —
/// it must not flip the already-committed Success record back to Failed.
/// </summary>
private async Task ApplyPostSuccessSideEffectsAsync(
Instance instance,
string deploymentId,
string revisionHash,
string configJson,
CancellationToken cancellationToken)
{
try
{
// WP-4: Update instance state to Enabled on successful deployment
instance.State = InstanceState.Enabled;
await _repository.UpdateInstanceAsync(instance, cancellationToken);
// WP-8: Store deployed config snapshot
await StoreDeployedSnapshotAsync(
instance.Id, deploymentId, revisionHash, configJson, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
}
catch (Exception postEx)
{
_logger.LogError(postEx,
"Deployment {DeploymentId} for instance {Instance} was applied by the site and " +
"recorded Success, but post-success persistence (instance state / config snapshot) " +
"failed -- central and site state may diverge until reconciled",
deploymentId, instance.UniqueName);
}
}
private async Task StoreDeployedSnapshotAsync(
int instanceId,
string deploymentId,