fix(deployment-manager): resolve DeploymentManager-001/002 — broaden failure catch, persist failure status with non-cancellable token
This commit is contained in:
@@ -8,7 +8,7 @@
|
|||||||
| Last reviewed | 2026-05-16 |
|
| Last reviewed | 2026-05-16 |
|
||||||
| Reviewer | claude-agent |
|
| Reviewer | claude-agent |
|
||||||
| Commit reviewed | `9c60592` |
|
| Commit reviewed | `9c60592` |
|
||||||
| Open findings | 14 |
|
| Open findings | 12 |
|
||||||
|
|
||||||
## Summary
|
## Summary
|
||||||
|
|
||||||
@@ -53,7 +53,7 @@ exercises a successful deployment or the lifecycle success paths.
|
|||||||
|--|--|
|
|--|--|
|
||||||
| Severity | High |
|
| Severity | High |
|
||||||
| Category | Error handling & resilience |
|
| Category | Error handling & resilience |
|
||||||
| Status | Open |
|
| Status | Resolved |
|
||||||
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:141-199` |
|
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:141-199` |
|
||||||
|
|
||||||
**Description**
|
**Description**
|
||||||
@@ -81,7 +81,13 @@ every exit path out of the `try`.
|
|||||||
|
|
||||||
**Resolution**
|
**Resolution**
|
||||||
|
|
||||||
_Unresolved._
|
Resolved 2026-05-16 (commit `<pending>`): broadened the `catch` in
|
||||||
|
`DeployInstanceAsync` to `catch (Exception ex)` so any exception (transport,
|
||||||
|
serialization, DB, `InvalidOperationException` from an uninitialized
|
||||||
|
`CommunicationService`) marks the deployment record `Failed` with the error
|
||||||
|
message and audit-logs the failure, instead of escaping and leaving the record
|
||||||
|
stuck in `InProgress`. Regression test:
|
||||||
|
`DeployInstanceAsync_CommunicationThrowsUnexpectedException_RecordMarkedFailed`.
|
||||||
|
|
||||||
### DeploymentManager-002 — Failure-status write uses a possibly-cancelled cancellation token
|
### DeploymentManager-002 — Failure-status write uses a possibly-cancelled cancellation token
|
||||||
|
|
||||||
@@ -89,7 +95,7 @@ _Unresolved._
|
|||||||
|--|--|
|
|--|--|
|
||||||
| Severity | High |
|
| Severity | High |
|
||||||
| Category | Error handling & resilience |
|
| Category | Error handling & resilience |
|
||||||
| Status | Open |
|
| Status | Resolved |
|
||||||
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:186-196` |
|
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:186-196` |
|
||||||
|
|
||||||
**Description**
|
**Description**
|
||||||
@@ -113,7 +119,14 @@ cancelled or timed out.
|
|||||||
|
|
||||||
**Resolution**
|
**Resolution**
|
||||||
|
|
||||||
_Unresolved._
|
Resolved 2026-05-16 (commit `<pending>`): the broadened `catch` block now
|
||||||
|
performs the failure-status write (`UpdateDeploymentRecordAsync`,
|
||||||
|
`SaveChangesAsync`) and the audit `LogAsync` with `CancellationToken.None`
|
||||||
|
instead of the operation's (possibly-cancelled) token, so the `Failed` status
|
||||||
|
is durably recorded even after a timeout/cancellation. The cleanup writes are
|
||||||
|
themselves wrapped in a `try`/`catch` that logs (without masking the original
|
||||||
|
error) if persistence still fails. Regression test:
|
||||||
|
`DeployInstanceAsync_FailureWrite_UsesNonCancellableToken`.
|
||||||
|
|
||||||
### DeploymentManager-003 — Successful-deployment cleanup is not atomic with the status write
|
### DeploymentManager-003 — Successful-deployment cleanup is not atomic with the status write
|
||||||
|
|
||||||
@@ -248,7 +261,19 @@ stale-rejection.
|
|||||||
|
|
||||||
**Resolution**
|
**Resolution**
|
||||||
|
|
||||||
_Unresolved._
|
_Unresolved._ Finding confirmed valid against the source — `GetDeploymentStatusAsync`
|
||||||
|
only reads the local `DeploymentRecord` via `GetDeploymentByDeploymentIdAsync`,
|
||||||
|
and `DeployInstanceAsync` unconditionally generates a new deployment ID with no
|
||||||
|
site reconciliation. Left Open: a proper fix is a cross-module new feature, not
|
||||||
|
a bug fix scoped to `ScadaLink.DeploymentManager`. It requires (1) a new
|
||||||
|
request/response message contract in `ScadaLink.Commons`, (2) a new
|
||||||
|
`CommunicationService` query method in `ScadaLink.Communication`, and (3)
|
||||||
|
site-side handling of the query — all outside the DeploymentManager module — plus
|
||||||
|
a design decision on the query protocol. The reconciliation logic in
|
||||||
|
`DeploymentService` cannot be implemented without those. Recommend tracking as a
|
||||||
|
dedicated cross-module feature work item (or, alternatively, amending the design
|
||||||
|
doc to delegate reconciliation entirely to site-side stale-rejection — also
|
||||||
|
outside this module's editable scope).
|
||||||
|
|
||||||
### DeploymentManager-007 — "Diff View" reduced to a hash comparison with no diff detail
|
### DeploymentManager-007 — "Diff View" reduced to a hash comparison with no diff detail
|
||||||
|
|
||||||
|
|||||||
@@ -183,19 +183,53 @@ public class DeploymentService
|
|||||||
: Result<DeploymentRecord>.Failure(
|
: Result<DeploymentRecord>.Failure(
|
||||||
$"Deployment failed: {response.ErrorMessage ?? "Unknown error"}");
|
$"Deployment failed: {response.ErrorMessage ?? "Unknown error"}");
|
||||||
}
|
}
|
||||||
catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
|
// DeploymentManager-001: any exception out of the try (timeout,
|
||||||
|
// cancellation, transport, serialization, DB) must leave the
|
||||||
|
// deployment record as Failed -- the design requires an interrupted
|
||||||
|
// deployment to be treated as failed, never stuck in InProgress.
|
||||||
|
//
|
||||||
|
// DeploymentManager-002: the failure-status write must NOT use the
|
||||||
|
// operation's cancellation token. If the operation was cancelled or
|
||||||
|
// timed out, that token is already cancelled and the cleanup writes
|
||||||
|
// would themselves throw before the Failed status is persisted.
|
||||||
|
// Use CancellationToken.None so the failure is durably recorded.
|
||||||
|
var isTimeout = ex is TimeoutException or OperationCanceledException;
|
||||||
|
|
||||||
record.Status = DeploymentStatus.Failed;
|
record.Status = DeploymentStatus.Failed;
|
||||||
record.ErrorMessage = $"Communication failure: {ex.Message}";
|
record.ErrorMessage = isTimeout
|
||||||
|
? $"Communication failure: {ex.Message}"
|
||||||
|
: $"Deployment error: {ex.Message}";
|
||||||
record.CompletedAt = DateTimeOffset.UtcNow;
|
record.CompletedAt = DateTimeOffset.UtcNow;
|
||||||
await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
|
|
||||||
await _repository.SaveChangesAsync(cancellationToken);
|
|
||||||
|
|
||||||
await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
|
try
|
||||||
instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message },
|
{
|
||||||
cancellationToken);
|
await _repository.UpdateDeploymentRecordAsync(record, CancellationToken.None);
|
||||||
|
await _repository.SaveChangesAsync(CancellationToken.None);
|
||||||
|
|
||||||
return Result<DeploymentRecord>.Failure($"Deployment timed out: {ex.Message}");
|
await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
|
||||||
|
instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message },
|
||||||
|
CancellationToken.None);
|
||||||
|
}
|
||||||
|
catch (Exception cleanupEx)
|
||||||
|
{
|
||||||
|
// The deployment already failed; a failed cleanup write must not
|
||||||
|
// mask the original error. Log loudly so an operator can reconcile.
|
||||||
|
_logger.LogError(cleanupEx,
|
||||||
|
"Failed to persist Failed status for deployment {DeploymentId} of instance {Instance} " +
|
||||||
|
"after deployment error: {Error}",
|
||||||
|
deploymentId, instance.UniqueName, ex.Message);
|
||||||
|
}
|
||||||
|
|
||||||
|
_logger.LogError(ex,
|
||||||
|
"Deployment {DeploymentId} for instance {Instance} failed",
|
||||||
|
deploymentId, instance.UniqueName);
|
||||||
|
|
||||||
|
return Result<DeploymentRecord>.Failure(
|
||||||
|
isTimeout
|
||||||
|
? $"Deployment timed out: {ex.Message}"
|
||||||
|
: $"Deployment failed: {ex.Message}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -150,6 +150,82 @@ public class DeploymentServiceTests
|
|||||||
await _repo.Received().AddDeploymentRecordAsync(Arg.Any<DeploymentRecord>(), Arg.Any<CancellationToken>());
|
await _repo.Received().AddDeploymentRecordAsync(Arg.Any<DeploymentRecord>(), Arg.Any<CancellationToken>());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── DeploymentManager-001: unexpected exception must not leave record InProgress ──
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task DeployInstanceAsync_CommunicationThrowsUnexpectedException_RecordMarkedFailed()
|
||||||
|
{
|
||||||
|
var instance = new Instance("TestInst") { Id = 1, SiteId = 1, State = InstanceState.NotDeployed };
|
||||||
|
_repo.GetInstanceByIdAsync(1).Returns(instance);
|
||||||
|
|
||||||
|
var config = new FlattenedConfiguration { InstanceUniqueName = "TestInst" };
|
||||||
|
_pipeline.FlattenAndValidateAsync(1, Arg.Any<CancellationToken>())
|
||||||
|
.Returns(Result<FlatteningPipelineResult>.Success(
|
||||||
|
new FlatteningPipelineResult(config, "sha256:abc", ValidationResult.Success())));
|
||||||
|
|
||||||
|
// Capture the deployment record so we can inspect its final state.
|
||||||
|
DeploymentRecord? captured = null;
|
||||||
|
await _repo.AddDeploymentRecordAsync(
|
||||||
|
Arg.Do<DeploymentRecord>(r => captured = r), Arg.Any<CancellationToken>());
|
||||||
|
|
||||||
|
// _comms has no actor set, so DeployInstanceAsync throws
|
||||||
|
// InvalidOperationException -- a non-timeout, non-cancellation exception.
|
||||||
|
var result = await _service.DeployInstanceAsync(1, "admin");
|
||||||
|
|
||||||
|
// The exception must be handled, not escape.
|
||||||
|
Assert.True(result.IsFailure);
|
||||||
|
Assert.Contains("Deployment failed", result.Error);
|
||||||
|
|
||||||
|
// The record must not be left stuck in InProgress.
|
||||||
|
Assert.NotNull(captured);
|
||||||
|
Assert.Equal(DeploymentStatus.Failed, captured!.Status);
|
||||||
|
Assert.NotNull(captured.ErrorMessage);
|
||||||
|
Assert.NotNull(captured.CompletedAt);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── DeploymentManager-002: failure write must not use a cancelled token ──
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task DeployInstanceAsync_FailureWrite_UsesNonCancellableToken()
|
||||||
|
{
|
||||||
|
var instance = new Instance("TestInst") { Id = 1, SiteId = 1, State = InstanceState.NotDeployed };
|
||||||
|
_repo.GetInstanceByIdAsync(Arg.Any<int>(), Arg.Any<CancellationToken>()).Returns(instance);
|
||||||
|
|
||||||
|
var config = new FlattenedConfiguration { InstanceUniqueName = "TestInst" };
|
||||||
|
_pipeline.FlattenAndValidateAsync(Arg.Any<int>(), Arg.Any<CancellationToken>())
|
||||||
|
.Returns(Result<FlatteningPipelineResult>.Success(
|
||||||
|
new FlatteningPipelineResult(config, "sha256:abc", ValidationResult.Success())));
|
||||||
|
|
||||||
|
DeploymentRecord? captured = null;
|
||||||
|
await _repo.AddDeploymentRecordAsync(
|
||||||
|
Arg.Do<DeploymentRecord>(r => captured = r), Arg.Any<CancellationToken>());
|
||||||
|
|
||||||
|
// Simulate a repository that rejects already-cancelled tokens (the
|
||||||
|
// real EF Core behaviour when the operation token is cancelled). If the
|
||||||
|
// catch block passes the operation's cancelled token, the Failed-status
|
||||||
|
// write throws and the record stays InProgress -- the exact bug.
|
||||||
|
_repo.UpdateDeploymentRecordAsync(
|
||||||
|
Arg.Is<DeploymentRecord>(r => r.Status == DeploymentStatus.Failed),
|
||||||
|
Arg.Is<CancellationToken>(ct => ct.IsCancellationRequested))
|
||||||
|
.Returns<Task>(_ => throw new OperationCanceledException());
|
||||||
|
_repo.SaveChangesAsync(Arg.Is<CancellationToken>(ct => ct.IsCancellationRequested))
|
||||||
|
.Returns<Task<int>>(_ => throw new OperationCanceledException());
|
||||||
|
|
||||||
|
// The communication call fails (no actor set). The catch block must
|
||||||
|
// persist the Failed status with a non-cancellable token, so cleanup
|
||||||
|
// succeeds even when the caller's token is cancelled.
|
||||||
|
var result = await _service.DeployInstanceAsync(1, "admin");
|
||||||
|
|
||||||
|
Assert.True(result.IsFailure);
|
||||||
|
Assert.NotNull(captured);
|
||||||
|
Assert.Equal(DeploymentStatus.Failed, captured!.Status);
|
||||||
|
|
||||||
|
// The Failed-status write happened with a non-cancelled token.
|
||||||
|
await _repo.Received().UpdateDeploymentRecordAsync(
|
||||||
|
Arg.Is<DeploymentRecord>(r => r.Status == DeploymentStatus.Failed),
|
||||||
|
Arg.Is<CancellationToken>(ct => !ct.IsCancellationRequested));
|
||||||
|
}
|
||||||
|
|
||||||
// ── WP-6: Lifecycle commands ──
|
// ── WP-6: Lifecycle commands ──
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
|
|||||||
Reference in New Issue
Block a user