using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ScadaLink.Commons.Entities.Deployment;
using ScadaLink.Commons.Entities.Instances;
using ScadaLink.Commons.Interfaces.Repositories;
using ScadaLink.Commons.Interfaces.Services;
using ScadaLink.Commons.Messages.Deployment;
using ScadaLink.Commons.Messages.Lifecycle;
using ScadaLink.Commons.Types;
using ScadaLink.Commons.Types.Enums;
using ScadaLink.Commons.Types.Flattening;
using ScadaLink.Communication;
using ScadaLink.TemplateEngine.Flattening;
using ScadaLink.TemplateEngine.Validation;

namespace ScadaLink.DeploymentManager;

/// <summary>
/// WP-1: Central-side deployment orchestration service.
/// Coordinates the full deployment pipeline:
/// 1. Validate instance state transition (WP-4)
/// 2. Acquire per-instance operation lock (WP-3)
/// 3. Flatten configuration via TemplateEngine (captures template state at time of flatten -- WP-16)
/// 4. Validate flattened configuration
/// 5. Compute revision hash and diff
/// 6. Send DeployInstanceCommand to site via CommunicationService
/// 7. Track deployment status with optimistic concurrency (WP-4)
/// 8. Store deployed config snapshot (WP-8)
/// 9. Audit log all actions
///
/// WP-2: Each deployment has a unique deployment ID (GUID) + revision hash.
/// WP-16: Template state captured at flatten time -- last-write-wins on templates is safe.
/// </summary>
public class DeploymentService
{
    private readonly IDeploymentManagerRepository _repository;
    private readonly ISiteRepository _siteRepository;
    private readonly IFlatteningPipeline _flatteningPipeline;
    private readonly CommunicationService _communicationService;
    private readonly OperationLockManager _lockManager;
    private readonly IAuditService _auditService;
    private readonly DeploymentManagerOptions _options;
    private readonly ILogger<DeploymentService> _logger;

    /// <summary>
    /// Prefix written to <see cref="DeploymentRecord.ErrorMessage"/> when a
    /// deployment fails because the site command timed out or was cancelled.
    /// Used by the query-before-redeploy trigger (DeploymentManager-006) to tell
    /// a timeout-induced failure apart from other deployment errors.
    /// </summary>
    private const string TimeoutFailurePrefix = "Communication failure:";

    /// <summary>
    /// Creates the service from its collaborators. <paramref name="options"/> is
    /// unwrapped once here; the other dependencies are stored as-is.
    /// </summary>
    public DeploymentService(
        IDeploymentManagerRepository repository,
        ISiteRepository siteRepository,
        IFlatteningPipeline flatteningPipeline,
        CommunicationService communicationService,
        OperationLockManager lockManager,
        IAuditService auditService,
        IOptions<DeploymentManagerOptions> options,
        ILogger<DeploymentService> logger)
    {
        _repository = repository;
        _siteRepository = siteRepository;
        _flatteningPipeline = flatteningPipeline;
        _communicationService = communicationService;
        _lockManager = lockManager;
        _auditService = auditService;
        _options = options.Value;
        _logger = logger;
    }

    /// <summary>
    /// Resolves the site's string identifier from the numeric DB ID.
    /// The communication layer routes by string identifier (e.g. "site-a"), not DB ID.
    /// Falls back to the numeric ID rendered as a string when the site row is not found.
    /// </summary>
    private async Task<string> ResolveSiteIdentifierAsync(int siteId, CancellationToken cancellationToken)
    {
        var site = await _siteRepository.GetSiteByIdAsync(siteId, cancellationToken);
        return site?.SiteIdentifier ?? siteId.ToString();
    }

    /// <summary>
    /// WP-1: Deploy an instance to its site.
    /// WP-2: Generates unique deployment ID, computes revision hash.
    /// WP-4: Validates state transitions, uses optimistic concurrency.
    /// WP-5: Site-side apply is all-or-nothing (handled by DeploymentManagerActor).
    /// WP-8: Stores deployed config snapshot on success.
    /// WP-16: Captures template state at time of flatten.
    /// </summary>
    /// <param name="instanceId">Database ID of the instance to deploy.</param>
    /// <param name="user">User recorded on the deployment record and audit entries.</param>
    /// <param name="cancellationToken">Cancels the operation; failure bookkeeping deliberately ignores it.</param>
    /// <returns>
    /// Success carrying the deployment record (a new one, or a reconciled prior
    /// record when the site already runs the target revision); otherwise a
    /// failure describing why the deployment did not complete.
    /// </returns>
    public async Task<Result<DeploymentRecord>> DeployInstanceAsync(
        int instanceId,
        string user,
        CancellationToken cancellationToken = default)
    {
        // Load instance
        var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
        if (instance == null)
            return Result<DeploymentRecord>.Failure($"Instance with ID {instanceId} not found.");

        // WP-4: Validate state transition
        var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "deploy");
        if (transitionError != null)
            return Result<DeploymentRecord>.Failure(transitionError);

        // WP-3: Acquire per-instance operation lock
        using var lockHandle = await _lockManager.AcquireAsync(
            instance.UniqueName, _options.OperationLockTimeout, cancellationToken);

        // WP-2: Generate unique deployment ID
        var deploymentId = Guid.NewGuid().ToString("N");

        // WP-1/16: Flatten configuration (captures template state at this point in time)
        var flattenResult = await _flatteningPipeline.FlattenAndValidateAsync(instanceId, cancellationToken);
        if (flattenResult.IsFailure)
            return Result<DeploymentRecord>.Failure($"Validation failed: {flattenResult.Error}");

        var flattenedConfig = flattenResult.Value.Configuration;
        var revisionHash = flattenResult.Value.RevisionHash;
        var validationResult = flattenResult.Value.Validation;

        if (!validationResult.IsValid)
        {
            var errors = string.Join("; ", validationResult.Errors.Select(e => e.Message));
            return Result<DeploymentRecord>.Failure($"Pre-deployment validation failed: {errors}");
        }

        // Serialize for transmission. Done before reconciliation because the
        // reconcile path stores this JSON as the deployed snapshot.
        var configJson = JsonSerializer.Serialize(flattenedConfig);

        // DeploymentManager-006: query-the-site-before-redeploy idempotency.
        // If a prior deployment for this instance is stuck InProgress or Failed
        // due to a timeout, the site may have actually applied the config. Query
        // the site for its currently-applied revision before re-sending so a
        // duplicate deployment is not produced (design: "Deployment Identity &
        // Idempotency"). A clean prior Success or a fresh first-time deploy
        // skips this extra round-trip.
        var reconciled = await TryReconcileWithSiteAsync(
            instance, revisionHash, configJson, cancellationToken);
        if (reconciled != null)
            return Result<DeploymentRecord>.Success(reconciled);

        // WP-4: Create deployment record with Pending status
        var record = new DeploymentRecord(deploymentId, user)
        {
            InstanceId = instanceId,
            Status = DeploymentStatus.Pending,
            RevisionHash = revisionHash,
            DeployedAt = DateTimeOffset.UtcNow
        };

        await _repository.AddDeploymentRecordAsync(record, cancellationToken);
        await _repository.SaveChangesAsync(cancellationToken);

        // Update status to InProgress
        record.Status = DeploymentStatus.InProgress;
        await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
        await _repository.SaveChangesAsync(cancellationToken);

        try
        {
            // WP-1: Send to site via CommunicationService
            var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
            var command = new DeployInstanceCommand(
                deploymentId, instance.UniqueName, revisionHash, configJson, user, DateTimeOffset.UtcNow);

            _logger.LogInformation(
                "Sending deployment {DeploymentId} for instance {Instance} to site {SiteId}",
                deploymentId, instance.UniqueName, siteId);

            var response = await _communicationService.DeployInstanceAsync(siteId, command, cancellationToken);

            // WP-1: Update status based on site response
            record.Status = response.Status;
            record.ErrorMessage = response.ErrorMessage;
            record.CompletedAt = DateTimeOffset.UtcNow;
            await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);

            if (response.Status == DeploymentStatus.Success)
            {
                // WP-4: Update instance state to Enabled on successful deployment
                instance.State = InstanceState.Enabled;
                await _repository.UpdateInstanceAsync(instance, cancellationToken);

                // WP-8: Store deployed config snapshot
                await StoreDeployedSnapshotAsync(instanceId, deploymentId, revisionHash, configJson, cancellationToken);
            }

            await _repository.SaveChangesAsync(cancellationToken);
        }
        catch (Exception ex)
        {
            // DeploymentManager-001: any exception out of the try (timeout,
            // cancellation, transport, serialization, DB) must leave the
            // deployment record as Failed -- the design requires an interrupted
            // deployment to be treated as failed, never stuck in InProgress.
            //
            // DeploymentManager-002: the failure-status write must NOT use the
            // operation's cancellation token. If the operation was cancelled or
            // timed out, that token is already cancelled and the cleanup writes
            // would themselves throw before the Failed status is persisted.
            // Use CancellationToken.None so the failure is durably recorded.
            var isTimeout = ex is TimeoutException or OperationCanceledException;

            record.Status = DeploymentStatus.Failed;
            record.ErrorMessage = isTimeout
                ? $"{TimeoutFailurePrefix} {ex.Message}"
                : $"Deployment error: {ex.Message}";
            record.CompletedAt = DateTimeOffset.UtcNow;

            try
            {
                await _repository.UpdateDeploymentRecordAsync(record, CancellationToken.None);
                await _repository.SaveChangesAsync(CancellationToken.None);

                await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
                    instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message },
                    CancellationToken.None);
            }
            catch (Exception cleanupEx)
            {
                // The deployment already failed; a failed cleanup write must not
                // mask the original error. Log loudly so an operator can reconcile.
                _logger.LogError(cleanupEx,
                    "Failed to persist Failed status for deployment {DeploymentId} of instance {Instance} " +
                    "after deployment error: {Error}",
                    deploymentId, instance.UniqueName, ex.Message);
            }

            _logger.LogError(ex,
                "Deployment {DeploymentId} for instance {Instance} failed",
                deploymentId, instance.UniqueName);

            return Result<DeploymentRecord>.Failure(
                isTimeout
                    ? $"Deployment timed out: {ex.Message}"
                    : $"Deployment failed: {ex.Message}");
        }

        // The site's answer is now durably recorded. The audit write is kept
        // outside the try above on purpose: if it threw inside, the catch would
        // rewrite an already-persisted outcome (possibly Success, with the
        // instance Enabled and a snapshot stored) to Failed. Audit is therefore
        // best-effort here, matching the failure path.
        try
        {
            await _auditService.LogAsync(user, "Deploy", "Instance", instanceId.ToString(),
                instance.UniqueName, new { DeploymentId = deploymentId, Status = record.Status.ToString() },
                cancellationToken);
        }
        catch (Exception auditEx)
        {
            _logger.LogError(auditEx,
                "Failed to write audit entry for deployment {DeploymentId} of instance {Instance}; " +
                "the persisted deployment outcome ({Status}) is unaffected",
                deploymentId, instance.UniqueName, record.Status);
        }

        _logger.LogInformation(
            "Deployment {DeploymentId} for instance {Instance}: {Status}",
            deploymentId, instance.UniqueName, record.Status);

        // record.ErrorMessage was copied from the site response above.
        return record.Status == DeploymentStatus.Success
            ? Result<DeploymentRecord>.Success(record)
            : Result<DeploymentRecord>.Failure(
                $"Deployment failed: {record.ErrorMessage ?? "Unknown error"}");
    }

    // NOTE(review): the lifecycle methods below return the site's response object
    // unchanged. Its type is assumed to be InstanceLifecycleResponse (from
    // ScadaLink.Commons.Messages.Lifecycle) -- confirm against CommunicationService.

    /// <summary>
    /// WP-6: Disable an instance. Stops Instance Actor, retains config, S&amp;F drains.
    /// The central state is set to Disabled only when the site reports success;
    /// the attempt is audited either way.
    /// </summary>
    public async Task<Result<InstanceLifecycleResponse>> DisableInstanceAsync(
        int instanceId,
        string user,
        CancellationToken cancellationToken = default)
    {
        var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
        if (instance == null)
            return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");

        var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "disable");
        if (transitionError != null)
            return Result<InstanceLifecycleResponse>.Failure(transitionError);

        using var lockHandle = await _lockManager.AcquireAsync(
            instance.UniqueName, _options.OperationLockTimeout, cancellationToken);

        var commandId = Guid.NewGuid().ToString("N");
        var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
        var command = new DisableInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);
        var response = await _communicationService.DisableInstanceAsync(siteId, command, cancellationToken);

        if (response.Success)
        {
            instance.State = InstanceState.Disabled;
            await _repository.UpdateInstanceAsync(instance, cancellationToken);
            await _repository.SaveChangesAsync(cancellationToken);
        }

        await _auditService.LogAsync(user, "Disable", "Instance", instanceId.ToString(),
            instance.UniqueName, new { CommandId = commandId, response.Success },
            cancellationToken);

        return response.Success
            ? Result<InstanceLifecycleResponse>.Success(response)
            : Result<InstanceLifecycleResponse>.Failure(response.ErrorMessage ?? "Disable failed.");
    }

    /// <summary>
    /// WP-6: Enable an instance. Re-creates Instance Actor from stored config.
    /// The central state is set to Enabled only when the site reports success;
    /// the attempt is audited either way.
    /// </summary>
    public async Task<Result<InstanceLifecycleResponse>> EnableInstanceAsync(
        int instanceId,
        string user,
        CancellationToken cancellationToken = default)
    {
        var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
        if (instance == null)
            return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");

        var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "enable");
        if (transitionError != null)
            return Result<InstanceLifecycleResponse>.Failure(transitionError);

        using var lockHandle = await _lockManager.AcquireAsync(
            instance.UniqueName, _options.OperationLockTimeout, cancellationToken);

        var commandId = Guid.NewGuid().ToString("N");
        var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
        var command = new EnableInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);
        var response = await _communicationService.EnableInstanceAsync(siteId, command, cancellationToken);

        if (response.Success)
        {
            instance.State = InstanceState.Enabled;
            await _repository.UpdateInstanceAsync(instance, cancellationToken);
            await _repository.SaveChangesAsync(cancellationToken);
        }

        await _auditService.LogAsync(user, "Enable", "Instance", instanceId.ToString(),
            instance.UniqueName, new { CommandId = commandId, response.Success },
            cancellationToken);

        return response.Success
            ? Result<InstanceLifecycleResponse>.Success(response)
            : Result<InstanceLifecycleResponse>.Failure(response.ErrorMessage ?? "Enable failed.");
    }

    /// <summary>
    /// WP-6: Delete an instance. Stops the site actor, removes site config, and
    /// removes the central instance record (deployment history, snapshot,
    /// overrides, and connection bindings go with it). S&amp;F NOT cleared.
    /// Delete fails if site unreachable (30s timeout via CommunicationOptions).
    /// </summary>
    public async Task<Result<InstanceLifecycleResponse>> DeleteInstanceAsync(
        int instanceId,
        string user,
        CancellationToken cancellationToken = default)
    {
        var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
        if (instance == null)
            return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");

        var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "delete");
        if (transitionError != null)
            return Result<InstanceLifecycleResponse>.Failure(transitionError);

        using var lockHandle = await _lockManager.AcquireAsync(
            instance.UniqueName, _options.OperationLockTimeout, cancellationToken);

        var commandId = Guid.NewGuid().ToString("N");
        var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
        var command = new DeleteInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);
        var response = await _communicationService.DeleteInstanceAsync(siteId, command, cancellationToken);

        if (response.Success)
        {
            // Delete means delete: remove the instance record entirely.
            // Deployment records, snapshot, overrides, and connection bindings
            // are removed with it (see repository implementation).
            await _repository.DeleteInstanceAsync(instanceId, cancellationToken);
            await _repository.SaveChangesAsync(cancellationToken);
        }

        await _auditService.LogAsync(user, "Delete", "Instance", instanceId.ToString(),
            instance.UniqueName, new { CommandId = commandId, response.Success },
            cancellationToken);

        return response.Success
            ? Result<InstanceLifecycleResponse>.Success(response)
            : Result<InstanceLifecycleResponse>.Failure(
                response.ErrorMessage ?? "Delete failed. Site may be unreachable.");
    }

    /// <summary>
    /// WP-8: Get the deployed config snapshot and compare with current template-derived state.
    /// The instance is reported stale when the snapshot's revision hash differs
    /// from the hash of a fresh flatten.
    /// </summary>
    public async Task<Result<DeploymentComparisonResult>> GetDeploymentComparisonAsync(
        int instanceId,
        CancellationToken cancellationToken = default)
    {
        var snapshot = await _repository.GetDeployedSnapshotByInstanceIdAsync(instanceId, cancellationToken);
        if (snapshot == null)
            return Result<DeploymentComparisonResult>.Failure("No deployed snapshot found for this instance.");

        // Compute current template-derived config
        var currentResult = await _flatteningPipeline.FlattenAndValidateAsync(instanceId, cancellationToken);
        if (currentResult.IsFailure)
            return Result<DeploymentComparisonResult>.Failure($"Cannot compute current config: {currentResult.Error}");

        var currentHash = currentResult.Value.RevisionHash;
        var isStale = snapshot.RevisionHash != currentHash;

        var result = new DeploymentComparisonResult(
            instanceId,
            snapshot.RevisionHash,
            currentHash,
            isStale,
            snapshot.DeployedAt);

        return Result<DeploymentComparisonResult>.Success(result);
    }

    /// <summary>
    /// WP-2: Looks up the centrally stored deployment record for a deployment ID,
    /// e.g. to inspect its state after a failover or timeout. This reads the
    /// central repository only; it does not contact the site.
    /// </summary>
    /// <returns>Whatever the repository returns for that ID.</returns>
    public async Task<DeploymentRecord?> GetDeploymentStatusAsync(
        string deploymentId,
        CancellationToken cancellationToken = default)
    {
        return await _repository.GetDeploymentByDeploymentIdAsync(deploymentId, cancellationToken);
    }

    /// <summary>
    /// DeploymentManager-006: query-the-site-before-redeploy reconciliation.
    ///
    /// The site query is issued ONLY when a prior <see cref="DeploymentRecord"/>
    /// for this instance is stuck <see cref="DeploymentStatus.InProgress"/>, or
    /// is <see cref="DeploymentStatus.Failed"/> due to a timeout -- the only
    /// cases where the site may have applied the config without central
    /// learning of it. Fresh first-time deploys and redeploys after a clean
    /// prior <see cref="DeploymentStatus.Success"/> skip the extra round-trip.
    ///
    /// Reconciliation: if the site already has the TARGET revision hash, the
    /// prior record is marked <see cref="DeploymentStatus.Success"/>, the
    /// instance is set to Enabled and the deployed snapshot is stored (the same
    /// central bookkeeping as a normal successful deploy), and the record is
    /// returned (the caller must NOT re-send the deploy). Otherwise <c>null</c>
    /// is returned and the normal deploy proceeds.
    ///
    /// Query failure: if the site is unreachable or the query times out, this
    /// returns <c>null</c> (fall through to a normal deploy) -- site-side
    /// stale-rejection of an older revision hash is the safety net. The deploy
    /// is never aborted on a failed query.
    /// </summary>
    /// <param name="instance">Instance being deployed.</param>
    /// <param name="targetRevisionHash">Revision hash of the configuration about to be deployed.</param>
    /// <param name="configJson">Serialized target configuration, stored as the snapshot on reconciliation.</param>
    /// <param name="cancellationToken">Cancels the lookup, site query and bookkeeping writes.</param>
    private async Task<DeploymentRecord?> TryReconcileWithSiteAsync(
        Instance instance,
        string targetRevisionHash,
        string configJson,
        CancellationToken cancellationToken)
    {
        var prior = await _repository.GetCurrentDeploymentStatusAsync(instance.Id, cancellationToken);
        if (prior == null || !ShouldQuerySiteBeforeRedeploy(prior))
            return null;

        DeploymentStateQueryResponse response;
        try
        {
            var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
            var query = new DeploymentStateQueryRequest(
                Guid.NewGuid().ToString("N"), instance.UniqueName, DateTimeOffset.UtcNow);

            _logger.LogInformation(
                "Querying site {SiteId} for applied deployment state of instance {Instance} " +
                "before re-deploy (prior record {DeploymentId} is {Status})",
                siteId, instance.UniqueName, prior.DeploymentId, prior.Status);

            response = await _communicationService.QueryDeploymentStateAsync(
                siteId, query, cancellationToken);
        }
        catch (Exception ex)
        {
            // Query failure (site unreachable / timeout): do NOT abort. Fall
            // through to a normal deploy; site-side stale-rejection of an older
            // revision hash is the safety net.
            _logger.LogWarning(ex,
                "Site query before re-deploy of instance {Instance} failed; " +
                "proceeding with normal deploy (site-side stale-rejection is the safety net)",
                instance.UniqueName);
            return null;
        }

        var siteHasTarget = response.IsDeployed
            && string.Equals(response.AppliedRevisionHash, targetRevisionHash, StringComparison.Ordinal);
        if (!siteHasTarget)
        {
            // Site does not have the target revision (or is not deployed) --
            // proceed with the normal deploy.
            return null;
        }

        // The site already has the target revision -- the prior deployment
        // actually succeeded. Reconcile the stale record instead of
        // re-sending the deploy.
        _logger.LogInformation(
            "Site already has target revision {RevisionHash} for instance {Instance}; " +
            "marking prior deployment record {DeploymentId} Success without re-deploying",
            targetRevisionHash, instance.UniqueName, prior.DeploymentId);

        prior.Status = DeploymentStatus.Success;
        prior.ErrorMessage = null;
        prior.CompletedAt = DateTimeOffset.UtcNow;
        await _repository.UpdateDeploymentRecordAsync(prior, cancellationToken);

        // Mirror the normal success path (WP-4 / WP-8): without these writes the
        // record would say Success while the instance state and the snapshot
        // used by GetDeploymentComparisonAsync still describe the pre-deploy world.
        instance.State = InstanceState.Enabled;
        await _repository.UpdateInstanceAsync(instance, cancellationToken);
        await StoreDeployedSnapshotAsync(
            instance.Id, prior.DeploymentId, targetRevisionHash, configJson, cancellationToken);

        await _repository.SaveChangesAsync(cancellationToken);

        await _auditService.LogAsync(prior.DeployedBy, "DeployReconciled", "Instance",
            instance.Id.ToString(), instance.UniqueName,
            new { DeploymentId = prior.DeploymentId, RevisionHash = targetRevisionHash },
            cancellationToken);

        return prior;
    }

    /// <summary>
    /// DeploymentManager-006: the site is queried before a re-deploy only when a
    /// prior record is stuck <see cref="DeploymentStatus.InProgress"/>, or is
    /// <see cref="DeploymentStatus.Failed"/> because the site command timed out
    /// (detected via the <see cref="TimeoutFailurePrefix"/> error-message
    /// marker). All other prior states skip the query.
    /// </summary>
    private static bool ShouldQuerySiteBeforeRedeploy(DeploymentRecord prior) =>
        prior.Status == DeploymentStatus.InProgress
        || (prior.Status == DeploymentStatus.Failed
            && prior.ErrorMessage != null
            && prior.ErrorMessage.StartsWith(TimeoutFailurePrefix, StringComparison.Ordinal));

    /// <summary>
    /// WP-8: Upserts the deployed-config snapshot for an instance: updates the
    /// existing snapshot in place when one exists, otherwise adds a new one.
    /// Does not call SaveChanges; the caller commits.
    /// </summary>
    private async Task StoreDeployedSnapshotAsync(
        int instanceId,
        string deploymentId,
        string revisionHash,
        string configJson,
        CancellationToken cancellationToken)
    {
        var existing = await _repository.GetDeployedSnapshotByInstanceIdAsync(instanceId, cancellationToken);

        if (existing != null)
        {
            existing.DeploymentId = deploymentId;
            existing.RevisionHash = revisionHash;
            existing.ConfigurationJson = configJson;
            existing.DeployedAt = DateTimeOffset.UtcNow;
            await _repository.UpdateDeployedSnapshotAsync(existing, cancellationToken);
        }
        else
        {
            var snapshot = new DeployedConfigSnapshot(deploymentId, revisionHash, configJson)
            {
                InstanceId = instanceId,
                DeployedAt = DateTimeOffset.UtcNow
            };
            await _repository.AddDeployedSnapshotAsync(snapshot, cancellationToken);
        }
    }
}

/// <summary>
/// WP-8: Result of comparing deployed vs template-derived configuration.
/// </summary>
public record DeploymentComparisonResult(
    int InstanceId,
    string DeployedRevisionHash,
    string CurrentRevisionHash,
    bool IsStale,
    DateTimeOffset DeployedAt);