From ab098bf6c843ab02a3e416da367686542e723e9f Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sat, 16 May 2026 19:40:40 -0400 Subject: [PATCH] =?UTF-8?q?fix(deployment-manager):=20resolve=20Deployment?= =?UTF-8?q?Manager-001/002=20=E2=80=94=20broaden=20failure=20catch,=20pers?= =?UTF-8?q?ist=20failure=20status=20with=20non-cancellable=20token?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- code-reviews/DeploymentManager/findings.md | 37 +++++++-- .../DeploymentService.cs | 50 ++++++++++-- .../DeploymentServiceTests.cs | 76 +++++++++++++++++++ 3 files changed, 149 insertions(+), 14 deletions(-) diff --git a/code-reviews/DeploymentManager/findings.md b/code-reviews/DeploymentManager/findings.md index 34fc601..6ba548f 100644 --- a/code-reviews/DeploymentManager/findings.md +++ b/code-reviews/DeploymentManager/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-16 | | Reviewer | claude-agent | | Commit reviewed | `9c60592` | -| Open findings | 14 | +| Open findings | 12 | ## Summary @@ -53,7 +53,7 @@ exercises a successful deployment or the lifecycle success paths. |--|--| | Severity | High | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:141-199` | **Description** @@ -81,7 +81,13 @@ every exit path out of the `try`. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit ``): broadened the `catch` in +`DeployInstanceAsync` to `catch (Exception ex)` so any exception (transport, +serialization, DB, `InvalidOperationException` from an uninitialized +`CommunicationService`) marks the deployment record `Failed` with the error +message and audit-logs the failure, instead of escaping and leaving the record +stuck in `InProgress`. Regression test: +`DeployInstanceAsync_CommunicationThrowsUnexpectedException_RecordMarkedFailed`. ### DeploymentManager-002 — Failure-status write uses a possibly-cancelled cancellation token @@ -89,7 +95,7 @@ _Unresolved._ |--|--| | Severity | High | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:186-196` | **Description** @@ -113,7 +119,14 @@ cancelled or timed out. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit ``): the broadened `catch` block now +performs the failure-status write (`UpdateDeploymentRecordAsync`, +`SaveChangesAsync`) and the audit `LogAsync` with `CancellationToken.None` +instead of the operation's (possibly-cancelled) token, so the `Failed` status +is durably recorded even after a timeout/cancellation. The cleanup writes are +themselves wrapped in a `try`/`catch` that logs (without masking the original +error) if persistence still fails. Regression test: +`DeployInstanceAsync_FailureWrite_UsesNonCancellableToken`. ### DeploymentManager-003 — Successful-deployment cleanup is not atomic with the status write @@ -248,7 +261,19 @@ stale-rejection. **Resolution** -_Unresolved._ +_Unresolved._ Finding confirmed valid against the source — `GetDeploymentStatusAsync` +only reads the local `DeploymentRecord` via `GetDeploymentByDeploymentIdAsync`, +and `DeployInstanceAsync` unconditionally generates a new deployment ID with no +site reconciliation. Left Open: a proper fix is a cross-module new feature, not +a bug fix scoped to `ScadaLink.DeploymentManager`. It requires (1) a new +request/response message contract in `ScadaLink.Commons`, (2) a new +`CommunicationService` query method in `ScadaLink.Communication`, and (3) +site-side handling of the query — all outside the DeploymentManager module — plus +a design decision on the query protocol. The reconciliation logic in +`DeploymentService` cannot be implemented without those. Recommend tracking as a +dedicated cross-module feature work item (or, alternatively, amending the design +doc to delegate reconciliation entirely to site-side stale-rejection — also +outside this module's editable scope). ### DeploymentManager-007 — "Diff View" reduced to a hash comparison with no diff detail diff --git a/src/ScadaLink.DeploymentManager/DeploymentService.cs b/src/ScadaLink.DeploymentManager/DeploymentService.cs index aaa2bde..683bacf 100644 --- a/src/ScadaLink.DeploymentManager/DeploymentService.cs +++ b/src/ScadaLink.DeploymentManager/DeploymentService.cs @@ -183,19 +183,53 @@ public class DeploymentService : Result.Failure( $"Deployment failed: {response.ErrorMessage ?? "Unknown error"}"); } - catch (Exception ex) when (ex is TimeoutException or OperationCanceledException) + catch (Exception ex) { + // DeploymentManager-001: any exception out of the try (timeout, + // cancellation, transport, serialization, DB) must leave the + // deployment record as Failed -- the design requires an interrupted + // deployment to be treated as failed, never stuck in InProgress. + // + // DeploymentManager-002: the failure-status write must NOT use the + // operation's cancellation token. If the operation was cancelled or + // timed out, that token is already cancelled and the cleanup writes + // would themselves throw before the Failed status is persisted. + // Use CancellationToken.None so the failure is durably recorded. + var isTimeout = ex is TimeoutException or OperationCanceledException; + record.Status = DeploymentStatus.Failed; - record.ErrorMessage = $"Communication failure: {ex.Message}"; + record.ErrorMessage = isTimeout + ? $"Communication failure: {ex.Message}" + : $"Deployment error: {ex.Message}"; record.CompletedAt = DateTimeOffset.UtcNow; - await _repository.UpdateDeploymentRecordAsync(record, cancellationToken); - await _repository.SaveChangesAsync(cancellationToken); - await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(), - instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message }, - cancellationToken); + try + { + await _repository.UpdateDeploymentRecordAsync(record, CancellationToken.None); + await _repository.SaveChangesAsync(CancellationToken.None); - return Result.Failure($"Deployment timed out: {ex.Message}"); + await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(), + instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message }, + CancellationToken.None); + } + catch (Exception cleanupEx) + { + // The deployment already failed; a failed cleanup write must not + // mask the original error. Log loudly so an operator can reconcile. + _logger.LogError(cleanupEx, + "Failed to persist Failed status for deployment {DeploymentId} of instance {Instance} " + + "after deployment error: {Error}", + deploymentId, instance.UniqueName, ex.Message); + } + + _logger.LogError(ex, + "Deployment {DeploymentId} for instance {Instance} failed", + deploymentId, instance.UniqueName); + + return Result.Failure( + isTimeout + ? $"Deployment timed out: {ex.Message}" + : $"Deployment failed: {ex.Message}"); } } diff --git a/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs b/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs index 4b6401e..3adbbd6 100644 --- a/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs +++ b/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs @@ -150,6 +150,82 @@ public class DeploymentServiceTests await _repo.Received().AddDeploymentRecordAsync(Arg.Any(), Arg.Any()); } + // ── DeploymentManager-001: unexpected exception must not leave record InProgress ── + + [Fact] + public async Task DeployInstanceAsync_CommunicationThrowsUnexpectedException_RecordMarkedFailed() + { + var instance = new Instance("TestInst") { Id = 1, SiteId = 1, State = InstanceState.NotDeployed }; + _repo.GetInstanceByIdAsync(1).Returns(instance); + + var config = new FlattenedConfiguration { InstanceUniqueName = "TestInst" }; + _pipeline.FlattenAndValidateAsync(1, Arg.Any()) + .Returns(Result.Success( + new FlatteningPipelineResult(config, "sha256:abc", ValidationResult.Success()))); + + // Capture the deployment record so we can inspect its final state. + DeploymentRecord? captured = null; + await _repo.AddDeploymentRecordAsync( + Arg.Do(r => captured = r), Arg.Any()); + + // _comms has no actor set, so DeployInstanceAsync throws + // InvalidOperationException -- a non-timeout, non-cancellation exception. + var result = await _service.DeployInstanceAsync(1, "admin"); + + // The exception must be handled, not escape. + Assert.True(result.IsFailure); + Assert.Contains("Deployment failed", result.Error); + + // The record must not be left stuck in InProgress. + Assert.NotNull(captured); + Assert.Equal(DeploymentStatus.Failed, captured!.Status); + Assert.NotNull(captured.ErrorMessage); + Assert.NotNull(captured.CompletedAt); + } + + // ── DeploymentManager-002: failure write must not use a cancelled token ── + + [Fact] + public async Task DeployInstanceAsync_FailureWrite_UsesNonCancellableToken() + { + var instance = new Instance("TestInst") { Id = 1, SiteId = 1, State = InstanceState.NotDeployed }; + _repo.GetInstanceByIdAsync(Arg.Any(), Arg.Any()).Returns(instance); + + var config = new FlattenedConfiguration { InstanceUniqueName = "TestInst" }; + _pipeline.FlattenAndValidateAsync(Arg.Any(), Arg.Any()) + .Returns(Result.Success( + new FlatteningPipelineResult(config, "sha256:abc", ValidationResult.Success()))); + + DeploymentRecord? captured = null; + await _repo.AddDeploymentRecordAsync( + Arg.Do(r => captured = r), Arg.Any()); + + // Simulate a repository that rejects already-cancelled tokens (the + // real EF Core behaviour when the operation token is cancelled). If the + // catch block passes the operation's cancelled token, the Failed-status + // write throws and the record stays InProgress -- the exact bug. + _repo.UpdateDeploymentRecordAsync( + Arg.Is(r => r.Status == DeploymentStatus.Failed), + Arg.Is(ct => ct.IsCancellationRequested)) + .Returns(_ => throw new OperationCanceledException()); + _repo.SaveChangesAsync(Arg.Is(ct => ct.IsCancellationRequested)) + .Returns>(_ => throw new OperationCanceledException()); + + // The communication call fails (no actor set). The catch block must + // persist the Failed status with a non-cancellable token, so cleanup + // succeeds even when the caller's token is cancelled. + var result = await _service.DeployInstanceAsync(1, "admin"); + + Assert.True(result.IsFailure); + Assert.NotNull(captured); + Assert.Equal(DeploymentStatus.Failed, captured!.Status); + + // The Failed-status write happened with a non-cancelled token. + await _repo.Received().UpdateDeploymentRecordAsync( + Arg.Is(r => r.Status == DeploymentStatus.Failed), + Arg.Is(ct => !ct.IsCancellationRequested)); + } + // ── WP-6: Lifecycle commands ── [Fact]