fix(deployment-manager): resolve DeploymentManager-001/002 — broaden failure catch, persist failure status with non-cancellable token
This commit is contained in:
@@ -183,19 +183,53 @@ public class DeploymentService
|
||||
: Result<DeploymentRecord>.Failure(
|
||||
$"Deployment failed: {response.ErrorMessage ?? "Unknown error"}");
|
||||
}
|
||||
- catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
|
||||
catch (Exception ex)
|
||||
{
|
||||
// DeploymentManager-001: any exception out of the try (timeout,
|
||||
// cancellation, transport, serialization, DB) must leave the
|
||||
// deployment record as Failed -- the design requires an interrupted
|
||||
// deployment to be treated as failed, never stuck in InProgress.
|
||||
//
|
||||
// DeploymentManager-002: the failure-status write must NOT use the
|
||||
// operation's cancellation token. If the operation was cancelled or
|
||||
// timed out, that token is already cancelled and the cleanup writes
|
||||
// would themselves throw before the Failed status is persisted.
|
||||
// Use CancellationToken.None so the failure is durably recorded.
|
||||
var isTimeout = ex is TimeoutException or OperationCanceledException;
|
||||
|
||||
record.Status = DeploymentStatus.Failed;
|
||||
- record.ErrorMessage = $"Communication failure: {ex.Message}";
|
||||
record.ErrorMessage = isTimeout
|
||||
? $"Communication failure: {ex.Message}"
|
||||
: $"Deployment error: {ex.Message}";
|
||||
record.CompletedAt = DateTimeOffset.UtcNow;
|
||||
- await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
|
||||
- await _repository.SaveChangesAsync(cancellationToken);
|
||||
|
||||
- await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
|
||||
- instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message },
|
||||
- cancellationToken);
|
||||
try
|
||||
{
|
||||
await _repository.UpdateDeploymentRecordAsync(record, CancellationToken.None);
|
||||
await _repository.SaveChangesAsync(CancellationToken.None);
|
||||
|
||||
- return Result<DeploymentRecord>.Failure($"Deployment timed out: {ex.Message}");
|
||||
await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
|
||||
instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message },
|
||||
CancellationToken.None);
|
||||
}
|
||||
catch (Exception cleanupEx)
|
||||
{
|
||||
// The deployment already failed; a failed cleanup write must not
|
||||
// mask the original error. Log loudly so an operator can reconcile.
|
||||
_logger.LogError(cleanupEx,
|
||||
"Failed to persist Failed status for deployment {DeploymentId} of instance {Instance} " +
|
||||
"after deployment error: {Error}",
|
||||
deploymentId, instance.UniqueName, ex.Message);
|
||||
}
|
||||
|
||||
_logger.LogError(ex,
|
||||
"Deployment {DeploymentId} for instance {Instance} failed",
|
||||
deploymentId, instance.UniqueName);
|
||||
|
||||
return Result<DeploymentRecord>.Failure(
|
||||
isTimeout
|
||||
? $"Deployment timed out: {ex.Message}"
|
||||
: $"Deployment failed: {ex.Message}");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user