fix(deployment-manager): resolve DeploymentManager-001/002 — broaden failure catch, persist failure status with non-cancellable token

This commit is contained in:
Joseph Doherty
2026-05-16 19:40:40 -04:00
parent fccd3274d3
commit ab098bf6c8
3 changed files with 149 additions and 14 deletions

View File

@@ -183,19 +183,53 @@ public class DeploymentService
: Result<DeploymentRecord>.Failure(
$"Deployment failed: {response.ErrorMessage ?? "Unknown error"}");
}
catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
catch (Exception ex)
{
// DeploymentManager-001: any exception out of the try (timeout,
// cancellation, transport, serialization, DB) must leave the
// deployment record as Failed -- the design requires an interrupted
// deployment to be treated as failed, never stuck in InProgress.
//
// DeploymentManager-002: the failure-status write must NOT use the
// operation's cancellation token. If the operation was cancelled or
// timed out, that token is already cancelled and the cleanup writes
// would themselves throw before the Failed status is persisted.
// Use CancellationToken.None so the failure is durably recorded.
var isTimeout = ex is TimeoutException or OperationCanceledException;
record.Status = DeploymentStatus.Failed;
record.ErrorMessage = $"Communication failure: {ex.Message}";
record.ErrorMessage = isTimeout
? $"Communication failure: {ex.Message}"
: $"Deployment error: {ex.Message}";
record.CompletedAt = DateTimeOffset.UtcNow;
await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message },
cancellationToken);
try
{
await _repository.UpdateDeploymentRecordAsync(record, CancellationToken.None);
await _repository.SaveChangesAsync(CancellationToken.None);
return Result<DeploymentRecord>.Failure($"Deployment timed out: {ex.Message}");
await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message },
CancellationToken.None);
}
catch (Exception cleanupEx)
{
// The deployment already failed; a failed cleanup write must not
// mask the original error. Log at error level so an operator can
// reconcile the stored record, which may not show the Failed status.
_logger.LogError(cleanupEx,
"Failed to persist Failed status for deployment {DeploymentId} of instance {Instance} " +
"after deployment error: {Error}",
deploymentId, instance.UniqueName, ex.Message);
}
_logger.LogError(ex,
"Deployment {DeploymentId} for instance {Instance} failed",
deploymentId, instance.UniqueName);
return Result<DeploymentRecord>.Failure(
isTimeout
? $"Deployment timed out: {ex.Message}"
: $"Deployment failed: {ex.Message}");
}
}