Files
ScadaBridge/src/ScadaLink.DeploymentManager/DeploymentService.cs
T
Joseph Doherty bc548e1447 feat(deployment-manager): resolve DeploymentManager-006 — query site deployment state before redeploy and reconcile
Adds DeploymentStateQuery request/response contracts (Commons), a site-side
handler (SiteRuntime), a CommunicationService query method (Communication), and
reconciliation in DeploymentService: when a prior record is InProgress or
Failed-on-timeout, query the site; if it already holds the target revision hash
mark the record Success without re-sending; on query failure fall through to a
normal deploy (site-side stale-rejection is the safety net).
2026-05-16 20:12:24 -04:00

560 lines
26 KiB
C#

using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ScadaLink.Commons.Entities.Deployment;
using ScadaLink.Commons.Entities.Instances;
using ScadaLink.Commons.Interfaces.Repositories;
using ScadaLink.Commons.Interfaces.Services;
using ScadaLink.Commons.Messages.Deployment;
using ScadaLink.Commons.Messages.Lifecycle;
using ScadaLink.Commons.Types;
using ScadaLink.Commons.Types.Enums;
using ScadaLink.Commons.Types.Flattening;
using ScadaLink.Communication;
using ScadaLink.TemplateEngine.Flattening;
using ScadaLink.TemplateEngine.Validation;
namespace ScadaLink.DeploymentManager;
/// <summary>
/// WP-1: Central-side deployment orchestration service.
/// Coordinates the full deployment pipeline:
/// 1. Validate instance state transition (WP-4)
/// 2. Acquire per-instance operation lock (WP-3)
/// 3. Flatten configuration via TemplateEngine (captures template state at time of flatten -- WP-16)
/// 4. Validate flattened configuration
/// 5. Compute revision hash and diff
/// 6. Send DeployInstanceCommand to site via CommunicationService
/// 7. Track deployment status with optimistic concurrency (WP-4)
/// 8. Store deployed config snapshot (WP-8)
/// 9. Audit log all actions
///
/// WP-2: Each deployment has a unique deployment ID (GUID) + revision hash.
/// WP-16: Template state captured at flatten time -- last-write-wins on templates is safe.
/// </summary>
public class DeploymentService
{
// Central persistence for instances, deployment records and deployed config snapshots.
private readonly IDeploymentManagerRepository _repository;
// Used to look up a site by its numeric DB ID when resolving its string identifier.
private readonly ISiteRepository _siteRepository;
// Flattens and validates an instance's configuration and yields its revision hash.
private readonly IFlatteningPipeline _flatteningPipeline;
// Sends deploy / lifecycle commands and deployment-state queries to a site.
private readonly CommunicationService _communicationService;
// Provides the per-instance operation lock, keyed by the instance's unique name.
private readonly OperationLockManager _lockManager;
// Receives an audit entry for every deploy, reconcile and lifecycle action.
private readonly IAuditService _auditService;
// Unwrapped options; this class reads OperationLockTimeout from it.
private readonly DeploymentManagerOptions _options;
private readonly ILogger<DeploymentService> _logger;
/// <summary>
/// Prefix written to <see cref="DeploymentRecord.ErrorMessage"/> when a
/// deployment fails because the site command timed out or was cancelled.
/// Used by the query-before-redeploy trigger (DeploymentManager-006) to tell
/// a timeout-induced failure apart from other deployment errors.
/// </summary>
private const string TimeoutFailurePrefix = "Communication failure:";
/// <summary>
/// Creates the service from its injected collaborators and unwraps the
/// configured <see cref="DeploymentManagerOptions"/> once, up front.
/// </summary>
public DeploymentService(
    IDeploymentManagerRepository repository,
    ISiteRepository siteRepository,
    IFlatteningPipeline flatteningPipeline,
    CommunicationService communicationService,
    OperationLockManager lockManager,
    IAuditService auditService,
    IOptions<DeploymentManagerOptions> options,
    ILogger<DeploymentService> logger)
{
    // Persistence.
    (_repository, _siteRepository) = (repository, siteRepository);

    // Configuration pipeline and site communication.
    (_flatteningPipeline, _communicationService) = (flatteningPipeline, communicationService);

    // Coordination and cross-cutting concerns.
    (_lockManager, _auditService, _logger) = (lockManager, auditService, logger);

    _options = options.Value;
}
/// <summary>
/// Resolves the site's string identifier from the numeric DB ID.
/// The communication layer routes by string identifier (e.g. "site-a"), not DB ID.
/// </summary>
/// <param name="siteId">Numeric database ID of the site.</param>
/// <param name="cancellationToken">Token passed to the site repository lookup.</param>
/// <returns>
/// The site's <c>SiteIdentifier</c> when the site exists and has one; otherwise
/// the numeric ID rendered as an invariant-culture string (best-effort fallback).
/// </returns>
private async Task<string> ResolveSiteIdentifierAsync(int siteId, CancellationToken cancellationToken)
{
    var site = await _siteRepository.GetSiteByIdAsync(siteId, cancellationToken);
    if (site?.SiteIdentifier is { } identifier)
        return identifier;

    // The fallback is kept (best effort), but it is no longer silent: routing by
    // the numeric ID most likely means the site record is missing or incomplete.
    _logger.LogWarning(
        "No site identifier found for site ID {SiteDbId}; falling back to the numeric ID for routing",
        siteId);

    // The value is a machine-readable routing key, so it must not depend on the
    // current thread culture.
    return siteId.ToString(System.Globalization.CultureInfo.InvariantCulture);
}
/// <summary>
/// WP-1: Deploy an instance to its site.
/// WP-2: Generates unique deployment ID, computes revision hash.
/// WP-4: Validates state transitions, uses optimistic concurrency.
/// WP-5: Site-side apply is all-or-nothing (handled by DeploymentManagerActor).
/// WP-8: Stores deployed config snapshot on success.
/// WP-16: Captures template state at time of flatten.
/// DeploymentManager-006: when a prior deployment is reconciled as already
/// applied on the site, no command is re-sent, but the instance state and the
/// deployed snapshot are updated exactly as for a normal successful deploy.
/// </summary>
/// <param name="instanceId">Database ID of the instance to deploy.</param>
/// <param name="user">User recorded on the deployment record, the site command and the audit entries.</param>
/// <param name="cancellationToken">
/// Token for the operation. It is deliberately NOT used for the failure-status
/// writes (see DeploymentManager-002 below).
/// </param>
/// <returns>
/// Success with the deployment record (new, or the reconciled prior record);
/// otherwise a failure describing the validation, site or communication error.
/// Errors raised before the site command is sent (lookup, lock acquisition,
/// flattening, reconciliation bookkeeping, record creation) propagate as
/// exceptions; errors from the send onwards are converted to a failure result.
/// </returns>
public async Task<Result<DeploymentRecord>> DeployInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    // Load instance
    var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (instance == null)
        return Result<DeploymentRecord>.Failure($"Instance with ID {instanceId} not found.");

    // WP-4: Validate state transition
    var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "deploy");
    if (transitionError != null)
        return Result<DeploymentRecord>.Failure(transitionError);

    // WP-3: Acquire per-instance operation lock (held until the method returns)
    using var lockHandle = await _lockManager.AcquireAsync(
        instance.UniqueName, _options.OperationLockTimeout, cancellationToken);

    // WP-2: Generate unique deployment ID
    var deploymentId = Guid.NewGuid().ToString("N");

    // WP-1/16: Flatten configuration (captures template state at this point in time)
    var flattenResult = await _flatteningPipeline.FlattenAndValidateAsync(instanceId, cancellationToken);
    if (flattenResult.IsFailure)
        return Result<DeploymentRecord>.Failure($"Validation failed: {flattenResult.Error}");

    var flattenedConfig = flattenResult.Value.Configuration;
    var revisionHash = flattenResult.Value.RevisionHash;
    var validationResult = flattenResult.Value.Validation;
    if (!validationResult.IsValid)
    {
        var errors = string.Join("; ", validationResult.Errors.Select(e => e.Message));
        return Result<DeploymentRecord>.Failure($"Pre-deployment validation failed: {errors}");
    }

    // Serialize once: the JSON is needed both for the site command and for the
    // deployed snapshot (normal success AND reconciled success).
    var configJson = JsonSerializer.Serialize(flattenedConfig);

    // DeploymentManager-006: query-the-site-before-redeploy idempotency.
    // If a prior deployment for this instance is stuck InProgress or Failed
    // due to a timeout, the site may have actually applied the config. Query
    // the site for its currently-applied revision before re-sending so a
    // duplicate deployment is not produced (design: "Deployment Identity &
    // Idempotency"). A clean prior Success or a fresh first-time deploy
    // skips this extra round-trip.
    var reconciled = await TryReconcileWithSiteAsync(
        instance, revisionHash, cancellationToken);
    if (reconciled != null)
    {
        // A reconciled deployment IS a successful deployment of this revision.
        // Bring central state to the same end state as the normal success path
        // below (instance Enabled + snapshot stored); otherwise the record says
        // Success while the instance state and deployment comparison still
        // reflect the pre-deploy situation.
        instance.State = InstanceState.Enabled;
        await _repository.UpdateInstanceAsync(instance, cancellationToken);
        await StoreDeployedSnapshotAsync(
            instanceId, reconciled.DeploymentId, revisionHash, configJson, cancellationToken);
        await _repository.SaveChangesAsync(cancellationToken);

        return Result<DeploymentRecord>.Success(reconciled);
    }

    // WP-4: Create deployment record with Pending status
    var record = new DeploymentRecord(deploymentId, user)
    {
        InstanceId = instanceId,
        Status = DeploymentStatus.Pending,
        RevisionHash = revisionHash,
        DeployedAt = DateTimeOffset.UtcNow
    };
    await _repository.AddDeploymentRecordAsync(record, cancellationToken);
    await _repository.SaveChangesAsync(cancellationToken);

    // Update status to InProgress
    record.Status = DeploymentStatus.InProgress;
    await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
    await _repository.SaveChangesAsync(cancellationToken);

    try
    {
        // WP-1: Send to site via CommunicationService
        var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
        var command = new DeployInstanceCommand(
            deploymentId, instance.UniqueName, revisionHash, configJson, user, DateTimeOffset.UtcNow);

        _logger.LogInformation(
            "Sending deployment {DeploymentId} for instance {Instance} to site {SiteId}",
            deploymentId, instance.UniqueName, siteId);

        var response = await _communicationService.DeployInstanceAsync(siteId, command, cancellationToken);

        // WP-1: Update status based on site response
        record.Status = response.Status;
        record.ErrorMessage = response.ErrorMessage;
        record.CompletedAt = DateTimeOffset.UtcNow;
        await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);

        if (response.Status == DeploymentStatus.Success)
        {
            // WP-4: Update instance state to Enabled on successful deployment
            instance.State = InstanceState.Enabled;
            await _repository.UpdateInstanceAsync(instance, cancellationToken);

            // WP-8: Store deployed config snapshot
            await StoreDeployedSnapshotAsync(instanceId, deploymentId, revisionHash, configJson, cancellationToken);
        }

        // Record, instance state and snapshot are saved together.
        await _repository.SaveChangesAsync(cancellationToken);

        // Audit log
        await _auditService.LogAsync(user, "Deploy", "Instance", instanceId.ToString(),
            instance.UniqueName, new { DeploymentId = deploymentId, Status = record.Status.ToString() },
            cancellationToken);

        _logger.LogInformation(
            "Deployment {DeploymentId} for instance {Instance}: {Status}",
            deploymentId, instance.UniqueName, record.Status);

        return record.Status == DeploymentStatus.Success
            ? Result<DeploymentRecord>.Success(record)
            : Result<DeploymentRecord>.Failure(
                $"Deployment failed: {response.ErrorMessage ?? "Unknown error"}");
    }
    catch (Exception ex)
    {
        // DeploymentManager-001: any exception out of the try (timeout,
        // cancellation, transport, serialization, DB) must leave the
        // deployment record as Failed -- the design requires an interrupted
        // deployment to be treated as failed, never stuck in InProgress.
        //
        // DeploymentManager-002: the failure-status write must NOT use the
        // operation's cancellation token. If the operation was cancelled or
        // timed out, that token is already cancelled and the cleanup writes
        // would themselves throw before the Failed status is persisted.
        // Use CancellationToken.None so the failure is durably recorded.
        var isTimeout = ex is TimeoutException or OperationCanceledException;

        record.Status = DeploymentStatus.Failed;
        // The prefix is what a later redeploy uses to decide whether the site
        // must be queried first (the site may have applied the config).
        record.ErrorMessage = isTimeout
            ? $"{TimeoutFailurePrefix} {ex.Message}"
            : $"Deployment error: {ex.Message}";
        record.CompletedAt = DateTimeOffset.UtcNow;

        try
        {
            await _repository.UpdateDeploymentRecordAsync(record, CancellationToken.None);
            await _repository.SaveChangesAsync(CancellationToken.None);
            await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
                instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message },
                CancellationToken.None);
        }
        catch (Exception cleanupEx)
        {
            // The deployment already failed; a failed cleanup write must not
            // mask the original error. Log loudly so an operator can reconcile.
            _logger.LogError(cleanupEx,
                "Failed to persist Failed status for deployment {DeploymentId} of instance {Instance} " +
                "after deployment error: {Error}",
                deploymentId, instance.UniqueName, ex.Message);
        }

        _logger.LogError(ex,
            "Deployment {DeploymentId} for instance {Instance} failed",
            deploymentId, instance.UniqueName);

        return Result<DeploymentRecord>.Failure(
            isTimeout
                ? $"Deployment timed out: {ex.Message}"
                : $"Deployment failed: {ex.Message}");
    }
}
/// <summary>
/// WP-6: Disable an instance. Stops Instance Actor, retains config, S&amp;F drains.
/// The central instance state becomes Disabled only when the site reports
/// success; an audit entry is written for either outcome.
/// </summary>
public async Task<Result<InstanceLifecycleResponse>> DisableInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    var target = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (target is null)
    {
        return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");
    }

    var rejection = StateTransitionValidator.ValidateTransition(target.State, "disable");
    if (rejection is not null)
    {
        return Result<InstanceLifecycleResponse>.Failure(rejection);
    }

    // Everything from here on runs under the per-instance operation lock.
    using (await _lockManager.AcquireAsync(
        target.UniqueName, _options.OperationLockTimeout, cancellationToken))
    {
        var correlationId = Guid.NewGuid().ToString("N");
        var routingKey = await ResolveSiteIdentifierAsync(target.SiteId, cancellationToken);

        var reply = await _communicationService.DisableInstanceAsync(
            routingKey,
            new DisableInstanceCommand(correlationId, target.UniqueName, DateTimeOffset.UtcNow),
            cancellationToken);

        if (reply.Success)
        {
            target.State = InstanceState.Disabled;
            await _repository.UpdateInstanceAsync(target, cancellationToken);
            await _repository.SaveChangesAsync(cancellationToken);
        }

        await _auditService.LogAsync(
            user,
            "Disable",
            "Instance",
            instanceId.ToString(),
            target.UniqueName,
            new { CommandId = correlationId, reply.Success },
            cancellationToken);

        if (reply.Success)
        {
            return Result<InstanceLifecycleResponse>.Success(reply);
        }

        return Result<InstanceLifecycleResponse>.Failure(reply.ErrorMessage ?? "Disable failed.");
    }
}
/// <summary>
/// WP-6: Enable an instance. Re-creates Instance Actor from stored config.
/// The central instance state becomes Enabled only when the site reports
/// success; an audit entry is written for either outcome.
/// </summary>
public async Task<Result<InstanceLifecycleResponse>> EnableInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    var target = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (target is null)
    {
        return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");
    }

    var rejection = StateTransitionValidator.ValidateTransition(target.State, "enable");
    if (rejection is not null)
    {
        return Result<InstanceLifecycleResponse>.Failure(rejection);
    }

    // Everything from here on runs under the per-instance operation lock.
    using (await _lockManager.AcquireAsync(
        target.UniqueName, _options.OperationLockTimeout, cancellationToken))
    {
        var correlationId = Guid.NewGuid().ToString("N");
        var routingKey = await ResolveSiteIdentifierAsync(target.SiteId, cancellationToken);

        var reply = await _communicationService.EnableInstanceAsync(
            routingKey,
            new EnableInstanceCommand(correlationId, target.UniqueName, DateTimeOffset.UtcNow),
            cancellationToken);

        if (reply.Success)
        {
            target.State = InstanceState.Enabled;
            await _repository.UpdateInstanceAsync(target, cancellationToken);
            await _repository.SaveChangesAsync(cancellationToken);
        }

        await _auditService.LogAsync(
            user,
            "Enable",
            "Instance",
            instanceId.ToString(),
            target.UniqueName,
            new { CommandId = correlationId, reply.Success },
            cancellationToken);

        if (reply.Success)
        {
            return Result<InstanceLifecycleResponse>.Success(reply);
        }

        return Result<InstanceLifecycleResponse>.Failure(reply.ErrorMessage ?? "Enable failed.");
    }
}
/// <summary>
/// WP-6: Delete an instance. Stops the site actor, removes site config, and
/// removes the central instance record (deployment history, snapshot,
/// overrides, and connection bindings go with it). S&amp;F NOT cleared.
/// Delete fails if site unreachable (30s timeout via CommunicationOptions).
/// The central record is removed only after the site reports success; an
/// audit entry is written for either outcome.
/// </summary>
public async Task<Result<InstanceLifecycleResponse>> DeleteInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    var target = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (target is null)
    {
        return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");
    }

    var rejection = StateTransitionValidator.ValidateTransition(target.State, "delete");
    if (rejection is not null)
    {
        return Result<InstanceLifecycleResponse>.Failure(rejection);
    }

    // Everything from here on runs under the per-instance operation lock.
    using (await _lockManager.AcquireAsync(
        target.UniqueName, _options.OperationLockTimeout, cancellationToken))
    {
        var correlationId = Guid.NewGuid().ToString("N");
        var routingKey = await ResolveSiteIdentifierAsync(target.SiteId, cancellationToken);

        var reply = await _communicationService.DeleteInstanceAsync(
            routingKey,
            new DeleteInstanceCommand(correlationId, target.UniqueName, DateTimeOffset.UtcNow),
            cancellationToken);

        if (reply.Success)
        {
            // Delete means delete: the instance record is removed outright,
            // and the repository takes its deployment records, snapshot,
            // overrides and connection bindings with it.
            await _repository.DeleteInstanceAsync(instanceId, cancellationToken);
            await _repository.SaveChangesAsync(cancellationToken);
        }

        await _auditService.LogAsync(
            user,
            "Delete",
            "Instance",
            instanceId.ToString(),
            target.UniqueName,
            new { CommandId = correlationId, reply.Success },
            cancellationToken);

        if (reply.Success)
        {
            return Result<InstanceLifecycleResponse>.Success(reply);
        }

        return Result<InstanceLifecycleResponse>.Failure(
            reply.ErrorMessage ?? "Delete failed. Site may be unreachable.");
    }
}
/// <summary>
/// WP-8: Compares the stored deployed snapshot of an instance with the
/// configuration currently derived from its templates. The instance is
/// reported stale when the two revision hashes differ.
/// </summary>
public async Task<Result<DeploymentComparisonResult>> GetDeploymentComparisonAsync(
    int instanceId,
    CancellationToken cancellationToken = default)
{
    var deployed = await _repository.GetDeployedSnapshotByInstanceIdAsync(instanceId, cancellationToken);
    if (deployed is null)
    {
        return Result<DeploymentComparisonResult>.Failure("No deployed snapshot found for this instance.");
    }

    // Re-flatten to obtain the revision hash of the current template-derived config.
    var flattened = await _flatteningPipeline.FlattenAndValidateAsync(instanceId, cancellationToken);
    if (flattened.IsFailure)
    {
        return Result<DeploymentComparisonResult>.Failure($"Cannot compute current config: {flattened.Error}");
    }

    var liveHash = flattened.Value.RevisionHash;
    var hashesDiffer = deployed.RevisionHash != liveHash;

    return Result<DeploymentComparisonResult>.Success(
        new DeploymentComparisonResult(
            InstanceId: instanceId,
            DeployedRevisionHash: deployed.RevisionHash,
            CurrentRevisionHash: liveHash,
            IsStale: hashesDiffer,
            DeployedAt: deployed.DeployedAt));
}
/// <summary>
/// WP-2: Looks up a deployment record in the central repository by its
/// deployment ID, e.g. to inspect a deployment's recorded status after a
/// failover or timeout. This method reads central state only; it does not
/// contact the site.
/// </summary>
/// <returns>The repository's result for <paramref name="deploymentId"/>, which may be <c>null</c>.</returns>
public async Task<DeploymentRecord?> GetDeploymentStatusAsync(
    string deploymentId,
    CancellationToken cancellationToken = default) =>
    await _repository.GetDeploymentByDeploymentIdAsync(deploymentId, cancellationToken);
/// <summary>
/// DeploymentManager-006: query-the-site-before-redeploy reconciliation.
///
/// The site query is issued ONLY when a prior <see cref="DeploymentRecord"/>
/// for this instance is stuck <see cref="DeploymentStatus.InProgress"/>, or
/// is <see cref="DeploymentStatus.Failed"/> due to a timeout — the only
/// cases where the site may have applied the config without central
/// learning of it. Fresh first-time deploys and redeploys after a clean
/// prior <see cref="DeploymentStatus.Success"/> skip the extra round-trip.
///
/// Reconciliation: if the site already has the TARGET revision hash, the
/// prior record is marked <see cref="DeploymentStatus.Success"/>, saved,
/// audited as "DeployReconciled" and returned (the caller must NOT re-send
/// the deploy). Otherwise <c>null</c> is returned and the normal deploy
/// proceeds.
///
/// Query failure: if the site is unreachable or the query times out, this
/// returns <c>null</c> (fall through to a normal deploy) — site-side
/// stale-rejection of an older revision hash is the safety net. The deploy
/// is never aborted on a failed query.
///
/// Caller cancellation is NOT a query failure: when
/// <paramref name="cancellationToken"/> itself has been cancelled, the
/// <see cref="OperationCanceledException"/> is rethrown instead of being
/// logged as a site problem and followed by a deploy on a cancelled token.
/// </summary>
/// <param name="instance">Instance about to be deployed.</param>
/// <param name="targetRevisionHash">Revision hash of the configuration that would be deployed.</param>
/// <param name="cancellationToken">Token for the repository, site query and audit calls.</param>
/// <returns>The reconciled prior record, or <c>null</c> when a normal deploy should proceed.</returns>
private async Task<DeploymentRecord?> TryReconcileWithSiteAsync(
    Instance instance,
    string targetRevisionHash,
    CancellationToken cancellationToken)
{
    var prior = await _repository.GetCurrentDeploymentStatusAsync(instance.Id, cancellationToken);
    if (prior == null || !ShouldQuerySiteBeforeRedeploy(prior))
        return null;

    DeploymentStateQueryResponse response;
    try
    {
        var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
        var query = new DeploymentStateQueryRequest(
            Guid.NewGuid().ToString("N"), instance.UniqueName, DateTimeOffset.UtcNow);

        _logger.LogInformation(
            "Querying site {SiteId} for applied deployment state of instance {Instance} " +
            "before re-deploy (prior record {DeploymentId} is {Status})",
            siteId, instance.UniqueName, prior.DeploymentId, prior.Status);

        response = await _communicationService.QueryDeploymentStateAsync(
            siteId, query, cancellationToken);
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // The caller cancelled the whole operation. That is not a site
        // failure, and every later write would run on the same cancelled
        // token, so surface the cancellation instead of falling through.
        throw;
    }
    catch (Exception ex)
    {
        // Query failure (site unreachable / timeout): do NOT abort. Fall
        // through to a normal deploy; site-side stale-rejection of an older
        // revision hash is the safety net.
        _logger.LogWarning(ex,
            "Site query before re-deploy of instance {Instance} failed; " +
            "proceeding with normal deploy (site-side stale-rejection is the safety net)",
            instance.UniqueName);
        return null;
    }

    var siteHasTargetRevision = response.IsDeployed &&
        string.Equals(response.AppliedRevisionHash, targetRevisionHash, StringComparison.Ordinal);
    if (!siteHasTargetRevision)
    {
        // Site does not have the target revision (or is not deployed) — proceed
        // with the normal deploy.
        return null;
    }

    // The site already has the target revision — the prior deployment
    // actually succeeded. Reconcile the stale record instead of
    // re-sending the deploy.
    _logger.LogInformation(
        "Site already has target revision {RevisionHash} for instance {Instance}; " +
        "marking prior deployment record {DeploymentId} Success without re-deploying",
        targetRevisionHash, instance.UniqueName, prior.DeploymentId);

    prior.Status = DeploymentStatus.Success;
    prior.ErrorMessage = null;
    prior.CompletedAt = DateTimeOffset.UtcNow;
    await _repository.UpdateDeploymentRecordAsync(prior, cancellationToken);
    await _repository.SaveChangesAsync(cancellationToken);

    // Audited under the user who started the prior deployment, since that is
    // the deployment being confirmed here.
    await _auditService.LogAsync(prior.DeployedBy, "DeployReconciled", "Instance",
        instance.Id.ToString(), instance.UniqueName,
        new { DeploymentId = prior.DeploymentId, RevisionHash = targetRevisionHash },
        cancellationToken);

    return prior;
}
/// <summary>
/// DeploymentManager-006: decides whether the site must be asked for its
/// applied revision before a re-deploy. That is the case only when the prior
/// record is still <see cref="DeploymentStatus.InProgress"/>, or is
/// <see cref="DeploymentStatus.Failed"/> with an error message that starts
/// with the <see cref="TimeoutFailurePrefix"/> marker. Every other prior
/// state skips the query.
/// </summary>
private static bool ShouldQuerySiteBeforeRedeploy(DeploymentRecord prior) =>
    prior.Status switch
    {
        DeploymentStatus.InProgress => true,
        DeploymentStatus.Failed =>
            prior.ErrorMessage?.StartsWith(TimeoutFailurePrefix, StringComparison.Ordinal) == true,
        _ => false,
    };
/// <summary>
/// WP-8: Upserts the deployed config snapshot of an instance: creates it when
/// none exists, otherwise overwrites the existing one with the new deployment
/// ID, revision hash, configuration JSON and a fresh timestamp. The change is
/// only staged through the repository; saving is left to the caller.
/// </summary>
private async Task StoreDeployedSnapshotAsync(
    int instanceId,
    string deploymentId,
    string revisionHash,
    string configJson,
    CancellationToken cancellationToken)
{
    var current = await _repository.GetDeployedSnapshotByInstanceIdAsync(instanceId, cancellationToken);

    if (current is null)
    {
        // First snapshot for this instance.
        await _repository.AddDeployedSnapshotAsync(
            new DeployedConfigSnapshot(deploymentId, revisionHash, configJson)
            {
                InstanceId = instanceId,
                DeployedAt = DateTimeOffset.UtcNow
            },
            cancellationToken);
        return;
    }

    // Overwrite the existing snapshot in place.
    current.DeploymentId = deploymentId;
    current.RevisionHash = revisionHash;
    current.ConfigurationJson = configJson;
    current.DeployedAt = DateTimeOffset.UtcNow;
    await _repository.UpdateDeployedSnapshotAsync(current, cancellationToken);
}
}
/// <summary>
/// WP-8: Result of comparing deployed vs template-derived configuration.
/// </summary>
/// <param name="InstanceId">ID of the instance that was compared.</param>
/// <param name="DeployedRevisionHash">Revision hash stored in the instance's deployed snapshot.</param>
/// <param name="CurrentRevisionHash">Revision hash of the configuration freshly flattened from the current templates.</param>
/// <param name="IsStale"><c>true</c> when the deployed and current revision hashes differ.</param>
/// <param name="DeployedAt">Timestamp taken from the deployed snapshot.</param>
public record DeploymentComparisonResult(
int InstanceId,
string DeployedRevisionHash,
string CurrentRevisionHash,
bool IsStale,
DateTimeOffset DeployedAt);