fix(deployment-manager): resolve DeploymentManager-015..017 — reconciliation applies post-success side effects, updates RevisionHash, corrected XML doc

This commit is contained in:
Joseph Doherty
2026-05-17 03:18:24 -04:00
parent 14ba5495d1
commit 4fa6f0e774
3 changed files with 193 additions and 34 deletions

View File

@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 | | Last reviewed | 2026-05-17 |
| Reviewer | claude-agent | | Reviewer | claude-agent |
| Commit reviewed | `39d737e` | | Commit reviewed | `39d737e` |
| Open findings | 3 | | Open findings | 0 |
## Summary ## Summary
@@ -740,7 +740,7 @@ covers DeploymentManager-010), `DeployToAllSitesAsync_PartialFailure_ReportsPerS
|--|--| |--|--|
| Severity | High | | Severity | High |
| Category | Correctness & logic bugs | | Category | Correctness & logic bugs |
| Status | Open | | Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:631-655` | | Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:631-655` |
**Description** **Description**
@@ -785,7 +785,15 @@ reconciled deployment leaves the instance `Enabled` and a snapshot stored.
**Resolution** **Resolution**
_Unresolved._ Resolved 2026-05-17 (commit pending): extracted the shared post-success side
effects into `ApplyPostSuccessSideEffectsAsync` (sets instance `State =
Enabled` + `UpdateInstanceAsync`, stores/refreshes the `DeployedConfigSnapshot`)
and invoked it from both the normal deploy success path and the
`TryReconcileWithSiteAsync` reconciled-success branch, so a reconciled
deployment now performs the same instance-state and snapshot updates as a
normal one (`configJson` is now computed before the reconciliation call and
threaded into `TryReconcileWithSiteAsync`). Regression test:
`DeployInstanceAsync_Reconciled_SetsInstanceEnabledAndStoresSnapshot`.
### DeploymentManager-016 — Reconciled prior record keeps its stale `RevisionHash` ### DeploymentManager-016 — Reconciled prior record keeps its stale `RevisionHash`
@@ -793,7 +801,7 @@ _Unresolved._
|--|--| |--|--|
| Severity | Medium | | Severity | Medium |
| Category | Correctness & logic bugs | | Category | Correctness & logic bugs |
| Status | Open | | Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:639-651` | | Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:639-651` |
**Description** **Description**
@@ -824,7 +832,11 @@ normal deploy.
**Resolution** **Resolution**
_Unresolved._ Resolved 2026-05-17 (commit pending): the reconciled-success branch of
`TryReconcileWithSiteAsync` now also sets `prior.RevisionHash =
targetRevisionHash`, so the persisted record, the `DeployReconciled` audit
entry, and the site's actually-applied revision all agree. Regression test:
`DeployInstanceAsync_Reconciled_PriorRecordRevisionHashUpdatedToTarget`.
### DeploymentManager-017 — `GetDeploymentStatusAsync` XML doc describes behaviour it does not implement ### DeploymentManager-017 — `GetDeploymentStatusAsync` XML doc describes behaviour it does not implement
@@ -832,7 +844,7 @@ _Unresolved._
|--|--| |--|--|
| Severity | Low | | Severity | Low |
| Category | Documentation & comments | | Category | Documentation & comments |
| Status | Open | | Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:562-570` | | Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:562-570` |
**Description** **Description**
@@ -855,4 +867,9 @@ configuration database" — and, if useful, cross-reference
**Resolution** **Resolution**
_Unresolved._ Resolved 2026-05-17 (commit pending): the `GetDeploymentStatusAsync` XML doc
now states it returns the persisted `DeploymentRecord` from the configuration
database as a pure local read, and cross-references `TryReconcileWithSiteAsync`
as where the query-the-site-before-redeploy reconciliation actually lives.
Documentation-only change; no regression test (a test asserting comment text
would be meaningless).

View File

@@ -142,6 +142,10 @@ public class DeploymentService
return Result<DeploymentRecord>.Failure($"Pre-deployment validation failed: {errors}"); return Result<DeploymentRecord>.Failure($"Pre-deployment validation failed: {errors}");
} }
// Serialize for transmission (also the payload stored in the deployed
// snapshot on success / reconciliation).
var configJson = JsonSerializer.Serialize(flattenedConfig);
// DeploymentManager-006: query-the-site-before-redeploy idempotency. // DeploymentManager-006: query-the-site-before-redeploy idempotency.
// If a prior deployment for this instance is stuck InProgress or Failed // If a prior deployment for this instance is stuck InProgress or Failed
// due to a timeout, the site may have actually applied the config. Query // due to a timeout, the site may have actually applied the config. Query
@@ -150,13 +154,10 @@ public class DeploymentService
// Idempotency"). A clean prior Success or a fresh first-time deploy // Idempotency"). A clean prior Success or a fresh first-time deploy
// skips this extra round-trip. // skips this extra round-trip.
var reconciled = await TryReconcileWithSiteAsync( var reconciled = await TryReconcileWithSiteAsync(
instance, revisionHash, cancellationToken); instance, revisionHash, configJson, cancellationToken);
if (reconciled != null) if (reconciled != null)
return Result<DeploymentRecord>.Success(reconciled); return Result<DeploymentRecord>.Success(reconciled);
// Serialize for transmission
var configJson = JsonSerializer.Serialize(flattenedConfig);
// WP-4: Create deployment record with Pending status // WP-4: Create deployment record with Pending status
var record = new DeploymentRecord(deploymentId, user) var record = new DeploymentRecord(deploymentId, user)
{ {
@@ -210,25 +211,8 @@ public class DeploymentService
// persistence below is best-effort: a failure here must be // persistence below is best-effort: a failure here must be
// logged loudly for operator reconciliation but must not flip // logged loudly for operator reconciliation but must not flip
// the already-committed Success record back to Failed. // the already-committed Success record back to Failed.
try await ApplyPostSuccessSideEffectsAsync(
{ instance, deploymentId, revisionHash, configJson, cancellationToken);
// WP-4: Update instance state to Enabled on successful deployment
instance.State = InstanceState.Enabled;
await _repository.UpdateInstanceAsync(instance, cancellationToken);
// WP-8: Store deployed config snapshot
await StoreDeployedSnapshotAsync(instanceId, deploymentId, revisionHash, configJson, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
}
catch (Exception postEx)
{
_logger.LogError(postEx,
"Deployment {DeploymentId} for instance {Instance} was applied by the site and " +
"recorded Success, but post-success persistence (instance state / config snapshot) " +
"failed -- central and site state may diverge until reconciled",
deploymentId, instance.UniqueName);
}
} }
// Audit log // Audit log
@@ -560,7 +544,12 @@ public class DeploymentService
} }
/// <summary> /// <summary>
/// WP-2: After failover/timeout, query site for current deployment state before re-deploying. /// WP-2: Returns the current persisted <see cref="DeploymentRecord"/> for
/// the given deployment ID from the configuration database. This is a pure
/// local DB read — it does not contact the site. The query-the-site-before-
/// redeploy reconciliation (design: "Deployment Identity &amp; Idempotency")
/// lives in <see cref="TryReconcileWithSiteAsync"/>, which
/// <see cref="DeployInstanceAsync"/> invokes on the deploy path.
/// </summary> /// </summary>
public async Task<DeploymentRecord?> GetDeploymentStatusAsync( public async Task<DeploymentRecord?> GetDeploymentStatusAsync(
string deploymentId, string deploymentId,
@@ -580,9 +569,14 @@ public class DeploymentService
/// prior <see cref="DeploymentStatus.Success"/> skip the extra round-trip. /// prior <see cref="DeploymentStatus.Success"/> skip the extra round-trip.
/// ///
/// Reconciliation: if the site already has the TARGET revision hash, the /// Reconciliation: if the site already has the TARGET revision hash, the
/// prior record is marked <see cref="DeploymentStatus.Success"/> and /// prior record is marked <see cref="DeploymentStatus.Success"/> (with its
/// returned (the caller must NOT re-send the deploy). Otherwise <c>null</c> /// <see cref="DeploymentRecord.RevisionHash"/> corrected to the target —
/// is returned and the normal deploy proceeds. /// DeploymentManager-016) and returned (the caller must NOT re-send the
/// deploy). The same post-success side effects as the normal deploy path
/// are applied — instance <see cref="InstanceState.Enabled"/> and a stored
/// <see cref="DeployedConfigSnapshot"/> (DeploymentManager-015) — so central
/// and site state do not diverge. Otherwise <c>null</c> is returned and the
/// normal deploy proceeds.
/// ///
/// Query failure: if the site is unreachable or the query times out, this /// Query failure: if the site is unreachable or the query times out, this
/// returns <c>null</c> (fall through to a normal deploy) — site-side /// returns <c>null</c> (fall through to a normal deploy) — site-side
@@ -592,6 +586,7 @@ public class DeploymentService
private async Task<DeploymentRecord?> TryReconcileWithSiteAsync( private async Task<DeploymentRecord?> TryReconcileWithSiteAsync(
Instance instance, Instance instance,
string targetRevisionHash, string targetRevisionHash,
string configJson,
CancellationToken cancellationToken) CancellationToken cancellationToken)
{ {
var prior = await _repository.GetCurrentDeploymentStatusAsync(instance.Id, cancellationToken); var prior = await _repository.GetCurrentDeploymentStatusAsync(instance.Id, cancellationToken);
@@ -639,10 +634,23 @@ public class DeploymentService
prior.Status = DeploymentStatus.Success; prior.Status = DeploymentStatus.Success;
prior.ErrorMessage = null; prior.ErrorMessage = null;
prior.CompletedAt = DateTimeOffset.UtcNow; prior.CompletedAt = DateTimeOffset.UtcNow;
// DeploymentManager-016: the prior record can legitimately carry a
// different (stale) revision hash than the current target. The site
// confirmed it is running the target revision, so the persisted
// record, the audit entry below, and the site must all agree.
prior.RevisionHash = targetRevisionHash;
await _repository.UpdateDeploymentRecordAsync(prior, cancellationToken); await _repository.UpdateDeploymentRecordAsync(prior, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken); await _repository.SaveChangesAsync(cancellationToken);
NotifyStatusChange(prior); NotifyStatusChange(prior);
// DeploymentManager-015: a reconciled deployment must perform the
// SAME post-success side effects as the normal deploy path — set
// the instance State to Enabled and store/refresh the deployed
// config snapshot — otherwise the central state machine and the
// deployed-snapshot invariant diverge from what the site is running.
await ApplyPostSuccessSideEffectsAsync(
instance, prior.DeploymentId, targetRevisionHash, configJson, cancellationToken);
await _auditService.LogAsync(prior.DeployedBy, "DeployReconciled", "Instance", await _auditService.LogAsync(prior.DeployedBy, "DeployReconciled", "Instance",
instance.Id.ToString(), instance.UniqueName, instance.Id.ToString(), instance.UniqueName,
new { DeploymentId = prior.DeploymentId, RevisionHash = targetRevisionHash }, new { DeploymentId = prior.DeploymentId, RevisionHash = targetRevisionHash },
@@ -669,6 +677,47 @@ public class DeploymentService
&& prior.ErrorMessage != null && prior.ErrorMessage != null
&& prior.ErrorMessage.StartsWith(TimeoutFailurePrefix, StringComparison.Ordinal)); && prior.ErrorMessage.StartsWith(TimeoutFailurePrefix, StringComparison.Ordinal));
/// <summary>
/// Post-success side effects shared by the normal deploy path and the
/// DeploymentManager-006 reconciliation path: set the instance
/// <see cref="InstanceState.Enabled"/> (WP-4) and store/refresh the
/// deployed config snapshot (WP-8). Factored into one helper so the two
/// paths cannot drift (DeploymentManager-015).
///
/// Best-effort: the deployment record's terminal <see cref="DeploymentStatus.Success"/>
/// status is already committed by the caller before this runs. A failure
/// here is logged loudly for operator reconciliation but is NOT propagated —
/// it must not flip the already-committed Success record back to Failed.
/// </summary>
private async Task ApplyPostSuccessSideEffectsAsync(
Instance instance,
string deploymentId,
string revisionHash,
string configJson,
CancellationToken cancellationToken)
{
try
{
// WP-4: Update instance state to Enabled on successful deployment
instance.State = InstanceState.Enabled;
await _repository.UpdateInstanceAsync(instance, cancellationToken);
// WP-8: Store deployed config snapshot
await StoreDeployedSnapshotAsync(
instance.Id, deploymentId, revisionHash, configJson, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
}
catch (Exception postEx)
{
_logger.LogError(postEx,
"Deployment {DeploymentId} for instance {Instance} was applied by the site and " +
"recorded Success, but post-success persistence (instance state / config snapshot) " +
"failed -- central and site state may diverge until reconciled",
deploymentId, instance.UniqueName);
}
}
private async Task StoreDeployedSnapshotAsync( private async Task StoreDeployedSnapshotAsync(
int instanceId, int instanceId,
string deploymentId, string deploymentId,

View File

@@ -766,6 +766,99 @@ public class DeploymentServiceTests : TestKit
Assert.Equal(1, ReconcileProbeActor.DeployCount); Assert.Equal(1, ReconcileProbeActor.DeployCount);
} }
// ── DeploymentManager-015: reconciliation must perform the normal success side effects ──
[Fact]
public async Task DeployInstanceAsync_Reconciled_SetsInstanceEnabledAndStoresSnapshot()
{
// A prior deploy timed out; the site actually applied the target
// revision. Reconciliation marks the prior record Success WITHOUT
// re-sending -- but it must still perform the same side effects as the
// normal success path: set the instance State to Enabled and store the
// deployed-config snapshot. Otherwise central and site diverge.
var instance = new Instance("ReconcileSideEffects")
{
Id = 70, SiteId = 1, State = InstanceState.NotDeployed
};
_repo.GetInstanceByIdAsync(70, Arg.Any<CancellationToken>()).Returns(instance);
SetupValidPipeline(70, "ReconcileSideEffects", "sha256:target");
var prior = new DeploymentRecord("dep-prior-70", "admin")
{
InstanceId = 70,
Status = DeploymentStatus.InProgress,
RevisionHash = "sha256:target"
};
_repo.GetCurrentDeploymentStatusAsync(70, Arg.Any<CancellationToken>()).Returns(prior);
_repo.GetDeployedSnapshotByInstanceIdAsync(70, Arg.Any<CancellationToken>())
.Returns((DeployedConfigSnapshot?)null);
DeployedConfigSnapshot? storedSnapshot = null;
await _repo.AddDeployedSnapshotAsync(
Arg.Do<DeployedConfigSnapshot>(s => storedSnapshot = s), Arg.Any<CancellationToken>());
var commActor = Sys.ActorOf(Props.Create(() =>
new ReconcileProbeActor(siteHash: "sha256:target", failQuery: false)));
var service = CreateServiceWithCommActor(commActor);
var result = await service.DeployInstanceAsync(70, "admin");
Assert.True(result.IsSuccess);
// No re-deploy was sent -- this was reconciled.
Assert.Equal(1, ReconcileProbeActor.QueryCount);
Assert.Equal(0, ReconcileProbeActor.DeployCount);
// DeploymentManager-015: the instance State must reflect the deployed
// config the site is actually running.
Assert.Equal(InstanceState.Enabled, instance.State);
await _repo.Received().UpdateInstanceAsync(instance, Arg.Any<CancellationToken>());
// DeploymentManager-015: a deployed-config snapshot must be stored so
// GetDeploymentComparisonAsync has something to compare against.
Assert.NotNull(storedSnapshot);
Assert.Equal(70, storedSnapshot!.InstanceId);
Assert.Equal("sha256:target", storedSnapshot.RevisionHash);
}
// ── DeploymentManager-016: reconciled record must carry the target revision hash ──
[Fact]
public async Task DeployInstanceAsync_Reconciled_PriorRecordRevisionHashUpdatedToTarget()
{
// The prior record carries a stale revision hash (R1), but the site
// reports it has the freshly-flattened target revision (R2). After
// reconciliation the prior record's RevisionHash must agree with the
// site's applied revision -- not keep the stale R1.
var instance = new Instance("ReconcileStaleHash")
{
Id = 71, SiteId = 1, State = InstanceState.Enabled
};
_repo.GetInstanceByIdAsync(71, Arg.Any<CancellationToken>()).Returns(instance);
SetupValidPipeline(71, "ReconcileStaleHash", "sha256:target");
var prior = new DeploymentRecord("dep-prior-71", "admin")
{
InstanceId = 71,
Status = DeploymentStatus.InProgress,
RevisionHash = "sha256:stale-R1"
};
_repo.GetCurrentDeploymentStatusAsync(71, Arg.Any<CancellationToken>()).Returns(prior);
_repo.GetDeployedSnapshotByInstanceIdAsync(71, Arg.Any<CancellationToken>())
.Returns((DeployedConfigSnapshot?)null);
var commActor = Sys.ActorOf(Props.Create(() =>
new ReconcileProbeActor(siteHash: "sha256:target", failQuery: false)));
var service = CreateServiceWithCommActor(commActor);
var result = await service.DeployInstanceAsync(71, "admin");
Assert.True(result.IsSuccess);
Assert.Equal(DeploymentStatus.Success, prior.Status);
// DeploymentManager-016: the persisted record's RevisionHash must agree
// with the site's applied revision, not keep the stale R1.
Assert.Equal("sha256:target", prior.RevisionHash);
}
// ── DeploymentManager-012: LifecycleCommandTimeout must actually bound lifecycle commands ── // ── DeploymentManager-012: LifecycleCommandTimeout must actually bound lifecycle commands ──
[Fact] [Fact]