fix(deployment-manager): resolve DeploymentManager-001/002 — broaden failure catch, persist failure status with non-cancellable token

This commit is contained in:
Joseph Doherty
2026-05-16 19:40:40 -04:00
parent fccd3274d3
commit ab098bf6c8
3 changed files with 149 additions and 14 deletions

View File

@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 14 |
| Open findings | 12 |
## Summary
@@ -53,7 +53,7 @@ exercises a successful deployment or the lifecycle success paths.
|--|--|
| Severity | High |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:141-199` |
**Description**
@@ -81,7 +81,13 @@ every exit path out of the `try`.
**Resolution**
_Unresolved._
Resolved 2026-05-16 (commit `<pending>`): broadened the `catch` in
`DeployInstanceAsync` to `catch (Exception ex)` so any exception (transport,
serialization, DB, `InvalidOperationException` from an uninitialized
`CommunicationService`) marks the deployment record `Failed` with the error
message and audit-logs the failure, instead of escaping and leaving the record
stuck in `InProgress`. Regression test:
`DeployInstanceAsync_CommunicationThrowsUnexpectedException_RecordMarkedFailed`.
### DeploymentManager-002 — Failure-status write uses a possibly-cancelled cancellation token
@@ -89,7 +95,7 @@ _Unresolved._
|--|--|
| Severity | High |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:186-196` |
**Description**
@@ -113,7 +119,14 @@ cancelled or timed out.
**Resolution**
_Unresolved._
Resolved 2026-05-16 (commit `<pending>`): the broadened `catch` block now
performs the failure-status write (`UpdateDeploymentRecordAsync`,
`SaveChangesAsync`) and the audit `LogAsync` with `CancellationToken.None`
instead of the operation's (possibly-cancelled) token, so the `Failed` status
is durably recorded even after a timeout/cancellation. The cleanup writes are
themselves wrapped in a `try`/`catch` that logs (without masking the original
error) if persistence still fails. Regression test:
`DeployInstanceAsync_FailureWrite_UsesNonCancellableToken`.
### DeploymentManager-003 — Successful-deployment cleanup is not atomic with the status write
@@ -248,7 +261,19 @@ stale-rejection.
**Resolution**
_Unresolved._
_Unresolved._ Finding confirmed valid against the source — `GetDeploymentStatusAsync`
only reads the local `DeploymentRecord` via `GetDeploymentByDeploymentIdAsync`,
and `DeployInstanceAsync` unconditionally generates a new deployment ID with no
site reconciliation. Left Open: a proper fix is a cross-module new feature, not
a bug fix scoped to `ScadaLink.DeploymentManager`. It requires (1) a new
request/response message contract in `ScadaLink.Commons`, (2) a new
`CommunicationService` query method in `ScadaLink.Communication`, and (3)
site-side handling of the query — all outside the DeploymentManager module — plus
a design decision on the query protocol. The reconciliation logic in
`DeploymentService` cannot be implemented without those. Recommend tracking as a
dedicated cross-module feature work item (or, alternatively, amending the design
doc to delegate reconciliation entirely to site-side stale-rejection — also
outside this module's editable scope).
### DeploymentManager-007 — "Diff View" reduced to a hash comparison with no diff detail

View File

@@ -183,19 +183,53 @@ public class DeploymentService
: Result<DeploymentRecord>.Failure(
$"Deployment failed: {response.ErrorMessage ?? "Unknown error"}");
}
catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
catch (Exception ex)
{
// DeploymentManager-001: any exception out of the try (timeout,
// cancellation, transport, serialization, DB) must leave the
// deployment record as Failed -- the design requires an interrupted
// deployment to be treated as failed, never stuck in InProgress.
//
// DeploymentManager-002: the failure-status write must NOT use the
// operation's cancellation token. If the operation was cancelled or
// timed out, that token is already cancelled and the cleanup writes
// would themselves throw before the Failed status is persisted.
// Use CancellationToken.None so the failure is durably recorded.
var isTimeout = ex is TimeoutException or OperationCanceledException;
record.Status = DeploymentStatus.Failed;
record.ErrorMessage = $"Communication failure: {ex.Message}";
record.ErrorMessage = isTimeout
? $"Communication failure: {ex.Message}"
: $"Deployment error: {ex.Message}";
record.CompletedAt = DateTimeOffset.UtcNow;
await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
try
{
await _repository.UpdateDeploymentRecordAsync(record, CancellationToken.None);
await _repository.SaveChangesAsync(CancellationToken.None);
await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message },
cancellationToken);
CancellationToken.None);
}
catch (Exception cleanupEx)
{
// The deployment already failed; a failed cleanup write must not
// mask the original error. Log loudly so an operator can reconcile.
_logger.LogError(cleanupEx,
"Failed to persist Failed status for deployment {DeploymentId} of instance {Instance} " +
"after deployment error: {Error}",
deploymentId, instance.UniqueName, ex.Message);
}
return Result<DeploymentRecord>.Failure($"Deployment timed out: {ex.Message}");
_logger.LogError(ex,
"Deployment {DeploymentId} for instance {Instance} failed",
deploymentId, instance.UniqueName);
return Result<DeploymentRecord>.Failure(
isTimeout
? $"Deployment timed out: {ex.Message}"
: $"Deployment failed: {ex.Message}");
}
}

View File

@@ -150,6 +150,82 @@ public class DeploymentServiceTests
await _repo.Received().AddDeploymentRecordAsync(Arg.Any<DeploymentRecord>(), Arg.Any<CancellationToken>());
}
// ── DeploymentManager-001: unexpected exception must not leave record InProgress ──
[Fact]
public async Task DeployInstanceAsync_CommunicationThrowsUnexpectedException_RecordMarkedFailed()
{
var instance = new Instance("TestInst") { Id = 1, SiteId = 1, State = InstanceState.NotDeployed };
_repo.GetInstanceByIdAsync(1).Returns(instance);
var config = new FlattenedConfiguration { InstanceUniqueName = "TestInst" };
_pipeline.FlattenAndValidateAsync(1, Arg.Any<CancellationToken>())
.Returns(Result<FlatteningPipelineResult>.Success(
new FlatteningPipelineResult(config, "sha256:abc", ValidationResult.Success())));
// Capture the deployment record so we can inspect its final state.
DeploymentRecord? captured = null;
await _repo.AddDeploymentRecordAsync(
Arg.Do<DeploymentRecord>(r => captured = r), Arg.Any<CancellationToken>());
// _comms has no actor set, so DeployInstanceAsync throws
// InvalidOperationException -- a non-timeout, non-cancellation exception.
var result = await _service.DeployInstanceAsync(1, "admin");
// The exception must be handled, not escape.
Assert.True(result.IsFailure);
Assert.Contains("Deployment failed", result.Error);
// The record must not be left stuck in InProgress.
Assert.NotNull(captured);
Assert.Equal(DeploymentStatus.Failed, captured!.Status);
Assert.NotNull(captured.ErrorMessage);
Assert.NotNull(captured.CompletedAt);
}
// ── DeploymentManager-002: failure write must not use a cancelled token ──
[Fact]
public async Task DeployInstanceAsync_FailureWrite_UsesNonCancellableToken()
{
var instance = new Instance("TestInst") { Id = 1, SiteId = 1, State = InstanceState.NotDeployed };
_repo.GetInstanceByIdAsync(Arg.Any<int>(), Arg.Any<CancellationToken>()).Returns(instance);
var config = new FlattenedConfiguration { InstanceUniqueName = "TestInst" };
_pipeline.FlattenAndValidateAsync(Arg.Any<int>(), Arg.Any<CancellationToken>())
.Returns(Result<FlatteningPipelineResult>.Success(
new FlatteningPipelineResult(config, "sha256:abc", ValidationResult.Success())));
DeploymentRecord? captured = null;
await _repo.AddDeploymentRecordAsync(
Arg.Do<DeploymentRecord>(r => captured = r), Arg.Any<CancellationToken>());
// Simulate a repository that rejects already-cancelled tokens (the
// real EF Core behaviour when the operation token is cancelled). If the
// catch block passes the operation's cancelled token, the Failed-status
// write throws and the record stays InProgress -- the exact bug.
_repo.UpdateDeploymentRecordAsync(
Arg.Is<DeploymentRecord>(r => r.Status == DeploymentStatus.Failed),
Arg.Is<CancellationToken>(ct => ct.IsCancellationRequested))
.Returns<Task>(_ => throw new OperationCanceledException());
_repo.SaveChangesAsync(Arg.Is<CancellationToken>(ct => ct.IsCancellationRequested))
.Returns<Task<int>>(_ => throw new OperationCanceledException());
// The communication call fails (no actor set). The catch block must
// persist the Failed status with a non-cancellable token, so cleanup
// succeeds even when the caller's token is cancelled.
var result = await _service.DeployInstanceAsync(1, "admin");
Assert.True(result.IsFailure);
Assert.NotNull(captured);
Assert.Equal(DeploymentStatus.Failed, captured!.Status);
// The Failed-status write happened with a non-cancelled token.
await _repo.Received().UpdateDeploymentRecordAsync(
Arg.Is<DeploymentRecord>(r => r.Status == DeploymentStatus.Failed),
Arg.Is<CancellationToken>(ct => !ct.IsCancellationRequested));
}
// ── WP-6: Lifecycle commands ──
[Fact]