ScadaBridge/src/ZB.MOM.WW.ScadaBridge.DeploymentManager/DeploymentService.cs

using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Deployment;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Instances;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Lifecycle;
using ZB.MOM.WW.ScadaBridge.Commons.Observability;
using ZB.MOM.WW.ScadaBridge.Commons.Types;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening;
using ZB.MOM.WW.ScadaBridge.Communication;
using ZB.MOM.WW.ScadaBridge.TemplateEngine.Flattening;
using ZB.MOM.WW.ScadaBridge.TemplateEngine.Validation;

namespace ZB.MOM.WW.ScadaBridge.DeploymentManager;

/// <summary>
/// WP-1: Central-side deployment orchestration service.
/// Coordinates the full deployment pipeline:
///   1. Validate instance state transition (WP-4)
///   2. Acquire per-instance operation lock (WP-3)
///   3. Flatten configuration via TemplateEngine (captures template state at time of flatten -- WP-16)
///   4. Validate flattened configuration
///   5. Compute revision hash and diff
///   6. Send DeployInstanceCommand to site via CommunicationService
///   7. Track deployment status with optimistic concurrency (WP-4)
///   8. Store deployed config snapshot (WP-8)
///   9. Audit log all actions
///
/// WP-2: Each deployment has a unique deployment ID (GUID) + revision hash.
/// WP-16: Template state captured at flatten time -- last-write-wins on templates is safe.
/// </summary>
public class DeploymentService
{
    // Data access for instances, deployment records, and deployed snapshots.
    private readonly IDeploymentManagerRepository _repository;
    // Site lookup; used to translate a numeric site ID into its routing identifier.
    private readonly ISiteRepository _siteRepository;
    // Flattens and validates an instance's template-derived configuration.
    private readonly IFlatteningPipeline _flatteningPipeline;
    // Sends deploy and lifecycle commands to a site and returns the site's response.
    private readonly CommunicationService _communicationService;
    // Hands out the per-instance operation lock, keyed by the instance's unique name.
    private readonly OperationLockManager _lockManager;
    // Records audit log entries for deploy and lifecycle actions.
    private readonly IAuditService _auditService;
    // Computes configuration diffs (per the constructor documentation).
    private readonly DiffService _diffService;
    // Recomputes a flattened configuration's revision hash (per the constructor documentation).
    private readonly RevisionHashService _revisionHashService;
    // Pushes deployment status changes out to listeners (see NotifyStatusChange).
    private readonly IDeploymentStatusNotifier _statusNotifier;
    // Resolved options; supplies OperationLockTimeout and LifecycleCommandTimeout.
    private readonly DeploymentManagerOptions _options;
    private readonly ILogger<DeploymentService> _logger;

    /// <summary>
    /// Prefix written to <see cref="DeploymentRecord.ErrorMessage"/> when a
    /// deployment fails because the site command timed out or was cancelled.
    /// Used by the query-before-redeploy trigger (DeploymentManager-006) to tell
    /// a timeout-induced failure apart from other deployment errors.
    /// </summary>
    private const string TimeoutFailurePrefix = "Communication failure:";

    /// <summary>
    /// Initializes a new instance of <see cref="DeploymentService"/> with all required dependencies.
    /// </summary>
    /// <param name="repository">Repository for deployment manager data access.</param>
    /// <param name="siteRepository">Repository for site data access.</param>
    /// <param name="flatteningPipeline">Pipeline for flattening and validating template configurations.</param>
    /// <param name="communicationService">Service for cross-cluster communication with sites.</param>
    /// <param name="lockManager">Manager for per-instance operation locks.</param>
    /// <param name="auditService">Service for recording audit log entries.</param>
    /// <param name="diffService">Service for computing configuration diffs.</param>
    /// <param name="revisionHashService">
    /// Service for recomputing a flattened configuration's revision hash. Used by
    /// <see cref="GetDeploymentComparisonAsync"/> to derive the deployed-side
    /// staleness hash from the (List-normalized) deserialized snapshot — see I-1.
    /// </param>
    /// <param name="statusNotifier">Notifier for pushing deployment status changes to the UI.</param>
    /// <param name="options">Deployment manager configuration options.</param>
    /// <param name="logger">Logger instance.</param>
    public DeploymentService(
        IDeploymentManagerRepository repository,
        ISiteRepository siteRepository,
        IFlatteningPipeline flatteningPipeline,
        CommunicationService communicationService,
        OperationLockManager lockManager,
        IAuditService auditService,
        DiffService diffService,
        RevisionHashService revisionHashService,
        IDeploymentStatusNotifier statusNotifier,
        IOptions<DeploymentManagerOptions> options,
        ILogger<DeploymentService> logger)
    {
        // Configuration and diagnostics.
        _options = options.Value;
        _logger = logger;

        // Persistence.
        _repository = repository;
        _siteRepository = siteRepository;

        // Configuration pipeline: flatten, diff, hash.
        _flatteningPipeline = flatteningPipeline;
        _diffService = diffService;
        _revisionHashService = revisionHashService;

        // Site communication and per-instance serialization.
        _communicationService = communicationService;
        _lockManager = lockManager;

        // Audit trail and UI status push.
        _auditService = auditService;
        _statusNotifier = statusNotifier;
    }

    /// <summary>
    /// CentralUI-006: raises a push notification that a deployment record's
    /// status was just persisted, so the Central UI deployment-status page can
    /// re-render over its SignalR circuit instead of polling. Called at every
    /// point a <see cref="DeploymentRecord"/> status is written.
    /// </summary>
    private void NotifyStatusChange(DeploymentRecord record)
    {
        // Package the record's identity and the status it currently carries,
        // then hand the change to the notifier.
        var statusChange = new DeploymentStatusChange(
            record.DeploymentId, record.InstanceId, record.Status);

        _statusNotifier.NotifyStatusChanged(statusChange);
    }

    /// <summary>
    /// Resolves the site's string identifier from the numeric DB ID.
    /// The communication layer routes by string identifier (e.g. "site-a"), not DB ID.
    ///
    /// DeploymentManager-021: when the <see cref="Site"/> row is missing (FK was
    /// deleted, race with admin delete, DB inconsistency) the previous behaviour
    /// silently substituted the numeric id rendered as a string — every
    /// downstream `CommunicationService` call then failed with a confusing
    /// "unknown site" routing error that hid the real cause. Treat a missing
    /// site row as a hard validation failure: throw
    /// <see cref="InvalidOperationException"/> naming the unresolved id so the
    /// operator sees the actual problem. On the deploy path the existing
    /// try/catch turns this into a Failed deployment record with a clear
    /// message; lifecycle paths propagate it to the caller (CLI/UI) which
    /// surface it as an error to the operator.
    /// </summary>
    private async Task<string> ResolveSiteIdentifierAsync(int siteId, CancellationToken cancellationToken)
    {
        // Fetch the site row by its numeric key; routing needs the string identifier it carries.
        var siteRow = await _siteRepository.GetSiteByIdAsync(siteId, cancellationToken);

        if (siteRow is not null)
        {
            return siteRow.SiteIdentifier;
        }

        // No row: fail hard and name the unresolved ID rather than substituting a placeholder.
        throw new InvalidOperationException(
            $"Site with ID {siteId} not found; cannot resolve its SiteIdentifier for routing.");
    }

    /// <summary>
    /// WP-1: Deploy an instance to its site.
    /// WP-2: Generates unique deployment ID, computes revision hash.
    /// WP-4: Validates state transitions, uses optimistic concurrency.
    /// WP-5: Site-side apply is all-or-nothing (handled by DeploymentManagerActor).
    /// WP-8: Stores deployed config snapshot on success.
    /// WP-16: Captures template state at time of flatten.
    /// </summary>
    /// <param name="instanceId">The database ID of the instance to deploy.</param>
    /// <param name="user">The username initiating the deployment, recorded in the audit log.</param>
    /// <param name="cancellationToken">Cancellation token for the operation.</param>
    /// <returns>A task that resolves to a success result containing the deployment record, or a failure result with an error message.</returns>
    public async Task<Result<DeploymentRecord>> DeployInstanceAsync(
        int instanceId,
        string user,
        CancellationToken cancellationToken = default)
    {
        // Load instance
        var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
        if (instance == null)
            return Result<DeploymentRecord>.Failure($"Instance with ID {instanceId} not found.");

        // WP-4: Validate state transition
        var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "deploy");
        if (transitionError != null)
            return Result<DeploymentRecord>.Failure(transitionError);

        // WP-3: Acquire per-instance operation lock
        using var lockHandle = await _lockManager.AcquireAsync(
            instance.UniqueName, _options.OperationLockTimeout, cancellationToken);

        // WP-2: Generate unique deployment ID
        var deploymentId = Guid.NewGuid().ToString("N");

        // WP-1/16: Flatten configuration (captures template state at this point in time)
        var flattenResult = await _flatteningPipeline.FlattenAndValidateAsync(instanceId, cancellationToken);
        if (flattenResult.IsFailure)
            return Result<DeploymentRecord>.Failure($"Validation failed: {flattenResult.Error}");

        var flattenedConfig = flattenResult.Value.Configuration;
        var revisionHash = flattenResult.Value.RevisionHash;
        var validationResult = flattenResult.Value.Validation;

        if (!validationResult.IsValid)
        {
            // Followup #8: return a grouped/summarized error (leading count + per-module
            // rollup, capped) instead of a flat semicolon-joined dump that becomes a wall
            // of text for instances with dozens of unbound attributes. The full per-entry
            // list still goes to the deploy log for operators who need every clause.
            _logger.LogWarning(
                "Pre-deployment validation failed for instance {InstanceId} ({ErrorCount} error(s)): {Detail}",
                instanceId,
                validationResult.Errors.Count,
                string.Join("; ", validationResult.Errors.Select(e => e.Message)));

            return Result<DeploymentRecord>.Failure(
                $"Pre-deployment validation failed: {validationResult.SummarizeErrors()}");
        }

        // Serialize for transmission (also the payload stored in the deployed
        // snapshot on success / reconciliation).
        var configJson = JsonSerializer.Serialize(flattenedConfig);

        // DeploymentManager-006: query-the-site-before-redeploy idempotency.
        // If a prior deployment for this instance is stuck InProgress or Failed
        // due to a timeout, the site may have actually applied the config. Query
        // the site for its currently-applied revision before re-sending so a
        // duplicate deployment is not produced (design: "Deployment Identity &
        // Idempotency"). A clean prior Success or a fresh first-time deploy
        // skips this extra round-trip.
        var reconciled = await TryReconcileWithSiteAsync(
            instance, revisionHash, configJson, user, cancellationToken);
        if (reconciled != null)
            return Result<DeploymentRecord>.Success(reconciled);

        // WP-4: Create the deployment record directly in InProgress.
        //
        // DeploymentManager-022: the previous code wrote the record as Pending,
        // then immediately updated it to InProgress with no work in between
        // (flattening, validation, and reconciliation all completed above). The
        // back-to-back write cost an extra SaveChangesAsync round-trip, an
        // extra IDeploymentStatusNotifier push (CentralUI-006 rendered a
        // Pending→InProgress flicker for ~ms), and an extra row-version bump
        // for nothing. The transient Pending slot carried no operational
        // meaning — it was set and immediately overwritten — so dropping it
        // collapses the start of the deploy into a single insert + notify.
        // InProgress remains the documented "sent to site, awaiting response"
        // state, set immediately before the round-trip below.
        var record = new DeploymentRecord(deploymentId, user)
        {
            InstanceId = instanceId,
            Status = DeploymentStatus.InProgress,
            RevisionHash = revisionHash,
            DeployedAt = DateTimeOffset.UtcNow
        };

        await _repository.AddDeploymentRecordAsync(record, cancellationToken);
        await _repository.SaveChangesAsync(cancellationToken);
        NotifyStatusChange(record);

        // Becomes true once the site-reported terminal status has been durably
        // saved. From that point on, a failure in a later step (notification,
        // telemetry, post-success side effects, audit) must not rewrite the
        // record -- see the catch block below.
        var terminalStatusCommitted = false;

        try
        {
            // WP-1: Send to site via CommunicationService
            var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
            var command = new DeployInstanceCommand(
                deploymentId, instance.UniqueName, revisionHash, configJson, user, DateTimeOffset.UtcNow);

            _logger.LogInformation(
                "Sending deployment {DeploymentId} for instance {Instance} to site {SiteId}",
                deploymentId, instance.UniqueName, siteId);

            var response = await _communicationService.DeployInstanceAsync(siteId, command, cancellationToken);

            // WP-1: Update status based on site response.
            record.Status = response.Status;
            record.ErrorMessage = response.ErrorMessage;
            record.CompletedAt = DateTimeOffset.UtcNow;

            // DeploymentManager-003: once the site has confirmed the apply,
            // commit the deployment record's terminal status BEFORE touching
            // instance state and the deployed-config snapshot. If a later write
            // (instance update / snapshot store) fails, the recorded fact that
            // the site succeeded must NOT be lost -- otherwise central reports a
            // non-Success record while the site is running the new config.
            await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
            await _repository.SaveChangesAsync(cancellationToken);
            terminalStatusCommitted = true;
            NotifyStatusChange(record);

            if (response.Status == DeploymentStatus.Success)
            {
                // Telemetry: one instance deployment successfully applied to a
                // site. Counted once per successful deploy operation (the unit
                // of scadabridge.deployments.applied — one DeployInstanceAsync
                // deploys exactly one instance to one site). Emitted only on this
                // confirmed-Success path, so failures, timeouts/retries (the
                // catch block), and the reconciliation path (which recovers a
                // PRIOR timed-out apply rather than performing a fresh one) do
                // not increment it.
                ScadaBridgeTelemetry.RecordDeploymentApplied();

                // The site has applied the deployment. The post-success
                // persistence below is best-effort: a failure here must be
                // logged loudly for operator reconciliation but must not flip
                // the already-committed Success record back to Failed.
                await ApplyPostSuccessSideEffectsAsync(
                    instance, deploymentId, revisionHash, configJson,
                    forceEnabledState: true, cancellationToken);
            }

            // Audit log
            await _auditService.LogAsync(user, "Deploy", "Instance", instanceId.ToString(),
                instance.UniqueName, new { DeploymentId = deploymentId, Status = record.Status.ToString() },
                cancellationToken);

            _logger.LogInformation(
                "Deployment {DeploymentId} for instance {Instance}: {Status}",
                deploymentId, instance.UniqueName, record.Status);

            return record.Status == DeploymentStatus.Success
                ? Result<DeploymentRecord>.Success(record)
                : Result<DeploymentRecord>.Failure(
                    $"Deployment failed: {response.ErrorMessage ?? "Unknown error"}");
        }
        catch (Exception ex)
        {
            // The site's answer is already durably recorded. An exception from a
            // later step (status notification, telemetry, post-success side
            // effects, or the audit write -- e.g. because the caller's token was
            // cancelled after the site responded) must not overwrite that record:
            // doing so would report Failed while the site runs the new config, or
            // replace the site's own error message with an unrelated one. Keep the
            // committed status, log loudly, and return the committed outcome.
            if (terminalStatusCommitted)
            {
                _logger.LogError(ex,
                    "Deployment {DeploymentId} for instance {Instance} was committed as {Status}, but a " +
                    "post-commit step (notification, side effects, or audit) failed; the committed " +
                    "status is retained",
                    deploymentId, instance.UniqueName, record.Status);

                return record.Status == DeploymentStatus.Success
                    ? Result<DeploymentRecord>.Success(record)
                    : Result<DeploymentRecord>.Failure(
                        $"Deployment failed: {record.ErrorMessage ?? "Unknown error"}");
            }

            // DeploymentManager-001: any exception out of the try (timeout,
            // cancellation, transport, serialization, DB) must leave the
            // deployment record as Failed -- the design requires an interrupted
            // deployment to be treated as failed, never stuck in InProgress.
            //
            // DeploymentManager-002: the failure-status write must NOT use the
            // operation's cancellation token. If the operation was cancelled or
            // timed out, that token is already cancelled and the cleanup writes
            // would themselves throw before the Failed status is persisted.
            // Use CancellationToken.None so the failure is durably recorded.
            var isTimeout = ex is TimeoutException or OperationCanceledException;

            record.Status = DeploymentStatus.Failed;
            record.ErrorMessage = isTimeout
                ? $"{TimeoutFailurePrefix} {ex.Message}"
                : $"Deployment error: {ex.Message}";
            record.CompletedAt = DateTimeOffset.UtcNow;

            try
            {
                await _repository.UpdateDeploymentRecordAsync(record, CancellationToken.None);
                await _repository.SaveChangesAsync(CancellationToken.None);
                NotifyStatusChange(record);

                await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
                    instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message },
                    CancellationToken.None);
            }
            catch (Exception cleanupEx)
            {
                // The deployment already failed; a failed cleanup write must not
                // mask the original error. Log loudly so an operator can reconcile.
                _logger.LogError(cleanupEx,
                    "Failed to persist Failed status for deployment {DeploymentId} of instance {Instance} " +
                    "after deployment error: {Error}",
                    deploymentId, instance.UniqueName, ex.Message);
            }

            _logger.LogError(ex,
                "Deployment {DeploymentId} for instance {Instance} failed",
                deploymentId, instance.UniqueName);

            return Result<DeploymentRecord>.Failure(
                isTimeout
                    ? $"Deployment timed out: {ex.Message}"
                    : $"Deployment failed: {ex.Message}");
        }
    }

    /// <summary>
    /// WP-6: Disable an instance. Stops Instance Actor, retains config, S&amp;F drains.
    /// </summary>
    /// <param name="instanceId">The database ID of the instance to disable.</param>
    /// <param name="user">The username initiating the operation, recorded in the audit log.</param>
    /// <param name="cancellationToken">Cancellation token for the operation.</param>
    /// <returns>A task that resolves to a success result with the site response, or a failure result with an error message.</returns>
    public async Task<Result<InstanceLifecycleResponse>> DisableInstanceAsync(
        int instanceId,
        string user,
        CancellationToken cancellationToken = default)
    {
        // Guard: the instance must exist.
        var target = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
        if (target is null)
        {
            return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");
        }

        // Guard: the current state must permit a "disable" transition.
        var rejection = StateTransitionValidator.ValidateTransition(target.State, "disable");
        if (rejection is not null)
        {
            return Result<InstanceLifecycleResponse>.Failure(rejection);
        }

        // Hold the per-instance operation lock for the remainder of the call.
        using var operationLock = await _lockManager.AcquireAsync(
            target.UniqueName, _options.OperationLockTimeout, cancellationToken);

        var commandId = Guid.NewGuid().ToString("N");
        var siteId = await ResolveSiteIdentifierAsync(target.SiteId, cancellationToken);
        var disableCommand = new DisableInstanceCommand(commandId, target.UniqueName, DateTimeOffset.UtcNow);

        // WP-6: the site round-trip is bounded by the configured lifecycle
        // timeout, so a hung or unreachable site cannot keep the operation lock
        // held indefinitely.
        InstanceLifecycleResponse siteResponse;
        try
        {
            using (var timeoutSource = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken))
            {
                timeoutSource.CancelAfter(_options.LifecycleCommandTimeout);
                siteResponse = await _communicationService.DisableInstanceAsync(
                    siteId, disableCommand, timeoutSource.Token);
            }
        }
        catch (Exception ex) when (ex is TimeoutException || ex is OperationCanceledException)
        {
            // DeploymentManager-019: a timed-out lifecycle command still gets an
            // audit row ("<Action>TimedOut"), so the operator's attempt shows up
            // in the audit trail. The helper is called without the caller's
            // token.
            await TryLogLifecycleTimeoutAsync(
                user, "DisableTimedOut", instanceId, target.UniqueName, commandId, ex);

            _logger.LogWarning(ex, "Disable of instance {Instance} timed out", target.UniqueName);
            return Result<InstanceLifecycleResponse>.Failure(
                $"Disable failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
        }

        var succeeded = siteResponse.Success;

        // Central state follows the site only once the site confirmed the disable.
        if (succeeded)
        {
            target.State = InstanceState.Disabled;
            await _repository.UpdateInstanceAsync(target, cancellationToken);
            await _repository.SaveChangesAsync(cancellationToken);
        }

        // The attempt is audited whether or not the site accepted it.
        await _auditService.LogAsync(user, "Disable", "Instance", instanceId.ToString(),
            target.UniqueName, new { CommandId = commandId, Success = succeeded },
            cancellationToken);

        if (succeeded)
        {
            return Result<InstanceLifecycleResponse>.Success(siteResponse);
        }

        return Result<InstanceLifecycleResponse>.Failure(siteResponse.ErrorMessage ?? "Disable failed.");
    }

    /// <summary>
    /// WP-6: Enable an instance. Re-creates Instance Actor from stored config.
    /// </summary>
    /// <param name="instanceId">The database ID of the instance to enable.</param>
    /// <param name="user">The username initiating the operation, recorded in the audit log.</param>
    /// <param name="cancellationToken">Cancellation token for the operation.</param>
    /// <returns>A task that resolves to a success result with the site response, or a failure result with an error message.</returns>
    public async Task<Result<InstanceLifecycleResponse>> EnableInstanceAsync(
        int instanceId,
        string user,
        CancellationToken cancellationToken = default)
    {
        // Guard: the instance must exist.
        var target = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
        if (target is null)
        {
            return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");
        }

        // Guard: the current state must permit an "enable" transition.
        var rejection = StateTransitionValidator.ValidateTransition(target.State, "enable");
        if (rejection is not null)
        {
            return Result<InstanceLifecycleResponse>.Failure(rejection);
        }

        // Hold the per-instance operation lock for the remainder of the call.
        using var operationLock = await _lockManager.AcquireAsync(
            target.UniqueName, _options.OperationLockTimeout, cancellationToken);

        var commandId = Guid.NewGuid().ToString("N");
        var siteId = await ResolveSiteIdentifierAsync(target.SiteId, cancellationToken);
        var enableCommand = new EnableInstanceCommand(commandId, target.UniqueName, DateTimeOffset.UtcNow);

        // WP-6: the site round-trip is bounded by the configured lifecycle timeout.
        InstanceLifecycleResponse siteResponse;
        try
        {
            using (var timeoutSource = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken))
            {
                timeoutSource.CancelAfter(_options.LifecycleCommandTimeout);
                siteResponse = await _communicationService.EnableInstanceAsync(
                    siteId, enableCommand, timeoutSource.Token);
            }
        }
        catch (Exception ex) when (ex is TimeoutException || ex is OperationCanceledException)
        {
            // DeploymentManager-019: a timed-out Enable still gets an audit row
            // so the operator's attempt shows up in the audit trail.
            await TryLogLifecycleTimeoutAsync(
                user, "EnableTimedOut", instanceId, target.UniqueName, commandId, ex);

            _logger.LogWarning(ex, "Enable of instance {Instance} timed out", target.UniqueName);
            return Result<InstanceLifecycleResponse>.Failure(
                $"Enable failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
        }

        var succeeded = siteResponse.Success;

        // Central state follows the site only once the site confirmed the enable.
        if (succeeded)
        {
            target.State = InstanceState.Enabled;
            await _repository.UpdateInstanceAsync(target, cancellationToken);
            await _repository.SaveChangesAsync(cancellationToken);
        }

        // The attempt is audited whether or not the site accepted it.
        await _auditService.LogAsync(user, "Enable", "Instance", instanceId.ToString(),
            target.UniqueName, new { CommandId = commandId, Success = succeeded },
            cancellationToken);

        if (succeeded)
        {
            return Result<InstanceLifecycleResponse>.Success(siteResponse);
        }

        return Result<InstanceLifecycleResponse>.Failure(siteResponse.ErrorMessage ?? "Enable failed.");
    }

    /// <summary>
    /// WP-6: Delete an instance. Stops the site actor, removes site config, and
    /// removes the central instance record (deployment history, snapshot,
    /// overrides, and connection bindings go with it). S&amp;F NOT cleared.
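    /// Delete fails if the site does not respond within
    /// <c>DeploymentManagerOptions.LifecycleCommandTimeout</c>, which this method
    /// enforces through a linked cancellation token, in addition to
    /// <c>CommunicationOptions.LifecycleTimeout</c> (applied inside
    /// <see cref="CommunicationService.DeleteInstanceAsync"/>).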
    /// </summary>
    /// <param name="instanceId">The database ID of the instance to delete.</param>
    /// <param name="user">The username initiating the deletion, recorded in the audit log.</param>
    /// <param name="cancellationToken">Cancellation token for the operation.</param>
    /// <returns>A task that resolves to a success result with the site response, or a failure result with an error message.</returns>
    public async Task<Result<InstanceLifecycleResponse>> DeleteInstanceAsync(
        int instanceId,
        string user,
        CancellationToken cancellationToken = default)
    {
        var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
        if (instance == null)
            return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");

        var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "delete");
        if (transitionError != null)
            return Result<InstanceLifecycleResponse>.Failure(transitionError);

        using var lockHandle = await _lockManager.AcquireAsync(
            instance.UniqueName, _options.OperationLockTimeout, cancellationToken);

        var commandId = Guid.NewGuid().ToString("N");
        var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
        var command = new DeleteInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);

        // WP-6: bound the round-trip with the configured lifecycle timeout.
        InstanceLifecycleResponse response;
        try
        {
            using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
            cts.CancelAfter(_options.LifecycleCommandTimeout);
            response = await _communicationService.DeleteInstanceAsync(siteId, command, cts.Token);
        }
        catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
        {
            // DeploymentManager-019: emit an audit entry on lifecycle timeout
            // so the operator's attempted Delete is recorded; see the matching
            // comment in DisableInstanceAsync for the full rationale.
            await TryLogLifecycleTimeoutAsync(
                user, "DeleteTimedOut", instanceId, instance.UniqueName, commandId, ex);

            _logger.LogWarning(ex, "Delete of instance {Instance} timed out", instance.UniqueName);
            return Result<InstanceLifecycleResponse>.Failure(
                $"Delete failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
        }

        if (response.Success)
        {
            // Delete means delete: remove the instance record entirely.
            // Deployment records, snapshot, overrides, and connection bindings
            // are removed with it (see repository implementation).
            //
            // DeploymentManager-004: the site has already destroyed the Instance
            // Actor and removed its config. If the central record removal now
            // fails (DB error / concurrency), the exception must NOT escape
            // uncaught -- that would leave the central record orphaned and
            // un-deletable through the normal path (a re-issued delete may fail
            // because the site no longer has the instance). Surface a distinct
            // failure so an operator can reconcile.
            try
            {
                await _repository.DeleteInstanceAsync(instanceId, cancellationToken);
                await _repository.SaveChangesAsync(cancellationToken);
            }
            catch (Exception ex)
            {
                _logger.LogError(ex,
                    "Instance {Instance} was deleted at the site, but the central record could not be " +
                    "removed -- the central record is now orphaned and must be reconciled manually",
                    instance.UniqueName);

                // The audit write is best-effort. The central store has just
                // failed, so this write may fail as well; if it were allowed to
                // throw, the exception would escape this handler and the caller
                // would never receive the distinct "orphaned" failure below.
                try
                {
                    await _auditService.LogAsync(user, "DeleteOrphaned", "Instance", instanceId.ToString(),
                        instance.UniqueName, new { CommandId = commandId, Error = ex.Message },
                        CancellationToken.None);
                }
                catch (Exception auditEx)
                {
                    _logger.LogError(auditEx,
                        "Failed to write the DeleteOrphaned audit entry for instance {Instance} " +
                        "after central record removal failed: {Error}",
                        instance.UniqueName, ex.Message);
                }

                return Result<InstanceLifecycleResponse>.Failure(
                    $"The site deleted instance '{instance.UniqueName}', but the central record could not " +
                    $"be removed: {ex.Message}. The central record is orphaned and must be reconciled.");
            }
        }

        await _auditService.LogAsync(user, "Delete", "Instance", instanceId.ToString(),
            instance.UniqueName, new { CommandId = commandId, response.Success },
            cancellationToken);

        return response.Success
            ? Result<InstanceLifecycleResponse>.Success(response)
            : Result<InstanceLifecycleResponse>.Failure(
                response.ErrorMessage ?? "Delete failed. Site may be unreachable.");
    }

    /// <summary>
    /// WP-8: Compares the instance's stored deployed-config snapshot with a fresh
    /// flatten of its current template-derived configuration. The result carries a
    /// staleness flag and — per the design's "Diff View" — a structured
    /// <see cref="ConfigurationDiff"/> computed by the TemplateEngine
    /// <see cref="DiffService"/> (added/removed/changed attributes, alarms, and
    /// scripts, including data connection binding changes).
    /// </summary>
    /// <param name="instanceId">The database ID of the instance to compare.</param>
    /// <param name="cancellationToken">Cancellation token for the operation.</param>
    /// <returns>
    /// A task that resolves to a success result with the comparison, or a failure
    /// result when no deployed snapshot exists or the current configuration cannot
    /// be flattened.
    /// </returns>
    public async Task<Result<DeploymentComparisonResult>> GetDeploymentComparisonAsync(
        int instanceId,
        CancellationToken cancellationToken = default)
    {
        var deployedSnapshot = await _repository.GetDeployedSnapshotByInstanceIdAsync(instanceId, cancellationToken);
        if (deployedSnapshot is null)
        {
            return Result<DeploymentComparisonResult>.Failure("No deployed snapshot found for this instance.");
        }

        // Fresh flatten: what the templates would produce if deployed right now.
        var flattenResult = await _flatteningPipeline.FlattenAndValidateAsync(instanceId, cancellationToken);
        if (flattenResult.IsFailure)
        {
            return Result<DeploymentComparisonResult>.Failure($"Cannot compute current config: {flattenResult.Error}");
        }

        var freshConfig = flattenResult.Value.Configuration;
        var freshHash = flattenResult.Value.RevisionHash;

        // I-1 (latent): the snapshot froze the FLATTENED config (ConfigurationJson
        // + RevisionHash) at deploy time, whereas the fresh flatten is always in
        // native List form (#93 consolidated element-type/coercion into
        // AttributeValueCodec, which emits native-form JSON arrays). A List
        // attribute deployed in the OLD quoted form (e.g. ["10","20"]) would
        // therefore (a) hash differently from the native re-flatten — a spurious
        // stale flag — and (b) surface as a spurious Changed attribute, because
        // DiffService.AttributesEqual is an ordinal Value comparison. To avoid
        // both, the deserialized snapshot's List values are normalized through
        // AttributeValueCodec Decode→Encode, and BOTH the staleness hash and the
        // diff are driven off that normalized snapshot. Scalars are untouched.
        //
        // DeploymentManager-007: when the snapshot cannot be deserialized
        // (corrupt / older schema) the frozen hash assigned here is kept and the
        // diff stays null.
        var deployedHash = deployedSnapshot.RevisionHash;
        ConfigurationDiff? structuredDiff = null;
        try
        {
            var snapshotConfig = JsonSerializer.Deserialize<FlattenedConfiguration>(deployedSnapshot.ConfigurationJson);
            if (snapshotConfig is null)
            {
                _logger.LogWarning(
                    "Deployed snapshot for instance {InstanceId} deserialized to null; " +
                    "returning hash-based comparison without a structured diff",
                    instanceId);
            }
            else
            {
                var normalizedConfig = NormalizeListAttributeValues(snapshotConfig);

                // Re-hash the normalized snapshot so an old-form List value is
                // not reported stale against the native re-flatten. A faithfully
                // stored scalar-only snapshot reproduces its frozen RevisionHash
                // exactly, so nothing changes outside the List-normalization case.
                deployedHash = _revisionHashService.ComputeHash(normalizedConfig);

                structuredDiff = _diffService.ComputeDiff(
                    normalizedConfig, freshConfig, deployedHash, freshHash);
            }
        }
        catch (JsonException ex)
        {
            _logger.LogWarning(ex,
                "Could not deserialize deployed snapshot for instance {InstanceId}; " +
                "returning hash-based comparison without a structured diff",
                instanceId);
        }

        return Result<DeploymentComparisonResult>.Success(
            new DeploymentComparisonResult(
                InstanceId: instanceId,
                DeployedRevisionHash: deployedHash,
                CurrentRevisionHash: freshHash,
                IsStale: deployedHash != freshHash,
                DeployedAt: deployedSnapshot.DeployedAt,
                Diff: structuredDiff));
    }

    /// <summary>
    /// I-1 (latent): normalizes a single <see cref="DataType.List"/> attribute by
    /// round-tripping its value through <see cref="AttributeValueCodec.Decode"/> →
    /// <see cref="AttributeValueCodec.Encode"/> (native JSON-array form). A value
    /// deployed in the OLD quoted form (e.g. <c>["10","20"]</c>) thereby becomes
    /// the native form (<c>[10,20]</c>) the current flattener produces, so neither
    /// the staleness hash nor the structured diff reports a spurious change.
    /// The same <paramref name="attr"/> instance is returned when nothing needs to
    /// change: the attribute is not a List, its value is null/empty, its element
    /// type cannot be parsed or is not a valid element type, or the round-trip
    /// yields an identical string. A value whose round-trip throws
    /// <see cref="FormatException"/> is also returned as-is — a normalization
    /// failure must never break the read-only comparison.
    /// </summary>
    private ResolvedAttribute NormalizeListAttribute(ResolvedAttribute attr)
    {
        // Only non-empty List values are candidates; scalars pass straight through.
        var isList = string.Equals(attr.DataType, nameof(DataType.List), StringComparison.OrdinalIgnoreCase);
        if (!isList || string.IsNullOrEmpty(attr.Value))
        {
            return attr;
        }

        // Without a usable element type the value cannot be decoded at all.
        var elementTypeParsed = Enum.TryParse<DataType>(attr.ElementDataType, ignoreCase: true, out var elementType);
        if (!elementTypeParsed || !AttributeValueCodec.IsValidElementType(elementType))
        {
            return attr;
        }

        try
        {
            var decoded = AttributeValueCodec.Decode(attr.Value, DataType.List, elementType);
            var nativeForm = AttributeValueCodec.Encode(decoded);

            // Hand back the original instance when already native, so callers
            // can detect "nothing changed" by reference.
            if (nativeForm == attr.Value)
            {
                return attr;
            }

            return attr with { Value = nativeForm };
        }
        catch (FormatException ex)
        {
            // Best-effort: a stored value that no longer round-trips is compared
            // verbatim instead of aborting the comparison. The warning lets an
            // operator investigate the stored value.
            _logger.LogWarning(ex,
                "Could not normalize List attribute '{Attribute}' in deployed snapshot; " +
                "comparing its stored value verbatim",
                attr.CanonicalName);
            return attr;
        }
    }

    /// <summary>
    /// I-1 (latent): runs <see cref="NormalizeListAttribute"/> over every attribute
    /// of <paramref name="config"/>. When at least one attribute came back as a
    /// different instance, a copy of the configuration carrying the normalized
    /// attribute list is returned; otherwise (including the empty case and the
    /// common scalar-only case) the original <paramref name="config"/> instance
    /// is returned unchanged.
    /// </summary>
    private FlattenedConfiguration NormalizeListAttributeValues(FlattenedConfiguration config)
    {
        var originals = config.Attributes;
        if (originals.Count == 0)
        {
            return config;
        }

        var rewritten = originals.Select(NormalizeListAttribute).ToList();

        // NormalizeListAttribute returns the very same instance when it leaves an
        // attribute alone, so a reference mismatch at any position means a change.
        for (var index = 0; index < rewritten.Count; index++)
        {
            if (!ReferenceEquals(rewritten[index], originals[index]))
            {
                return config with { Attributes = rewritten };
            }
        }

        return config;
    }

    /// <summary>
    /// WP-2: Looks up the persisted <see cref="DeploymentRecord"/> for a deployment
    /// ID in the configuration database. This is a pure local DB read and never
    /// contacts the site. The query-the-site-before-redeploy reconciliation
    /// (design: "Deployment Identity &amp; Idempotency") is handled separately by
    /// <see cref="TryReconcileWithSiteAsync"/>, which
    /// <see cref="DeployInstanceAsync"/> invokes on the deploy path.
    /// </summary>
    /// <param name="deploymentId">The unique deployment identifier to look up.</param>
    /// <param name="cancellationToken">Cancellation token for the operation.</param>
    /// <returns>A task that resolves to the matching deployment record, or null if none exists.</returns>
    public async Task<DeploymentRecord?> GetDeploymentStatusAsync(
        string deploymentId,
        CancellationToken cancellationToken = default)
        => await _repository.GetDeploymentByDeploymentIdAsync(deploymentId, cancellationToken);

    /// <summary>
    /// DeploymentManager-006: query-the-site-before-redeploy reconciliation.
    ///
    /// When the site is queried: ONLY if the instance's current
    /// <see cref="DeploymentRecord"/> is stuck <see cref="DeploymentStatus.InProgress"/>
    /// or is <see cref="DeploymentStatus.Failed"/> due to a timeout (see
    /// <see cref="ShouldQuerySiteBeforeRedeploy"/>) — the only cases where the site
    /// may have applied the config without central learning of it. A fresh
    /// first-time deploy, or a redeploy after a clean prior
    /// <see cref="DeploymentStatus.Success"/>, skips the extra round-trip.
    ///
    /// Reconciliation: when the site already reports the TARGET revision hash, the
    /// prior record is marked <see cref="DeploymentStatus.Success"/>, its
    /// <see cref="DeploymentRecord.RevisionHash"/> is corrected to the target
    /// (DeploymentManager-016), and the record is returned — the caller must NOT
    /// re-send the deploy. The post-success side effects of the normal deploy path
    /// are applied as well (DeploymentManager-015): the
    /// <see cref="DeployedConfigSnapshot"/> is stored/refreshed and a
    /// <see cref="InstanceState.NotDeployed"/> instance is promoted to
    /// <see cref="InstanceState.Enabled"/>, while an explicit Disabled state is
    /// preserved (DeploymentManager-018). In every other case <c>null</c> is
    /// returned and the normal deploy proceeds.
    ///
    /// Query failure: if the site is unreachable or the query times out, this
    /// returns <c>null</c> (fall through to a normal deploy) — site-side
    /// stale-rejection of an older revision hash is the safety net. The deploy is
    /// never aborted on a failed query.
    /// </summary>
    /// <param name="instance">The instance being redeployed.</param>
    /// <param name="targetRevisionHash">Revision hash the redeploy would apply; compared with the hash the site reports.</param>
    /// <param name="configJson">Configuration JSON stored as the deployed snapshot when reconciliation succeeds.</param>
    /// <param name="currentUser">User the <c>DeployReconciled</c> audit entry is attributed to.</param>
    /// <param name="cancellationToken">Cancellation token for the operation.</param>
    /// <returns>The reconciled prior record, or <c>null</c> when the normal deploy should proceed.</returns>
    private async Task<DeploymentRecord?> TryReconcileWithSiteAsync(
        Instance instance,
        string targetRevisionHash,
        string configJson,
        string currentUser,
        CancellationToken cancellationToken)
    {
        var prior = await _repository.GetCurrentDeploymentStatusAsync(instance.Id, cancellationToken);
        if (prior is null)
        {
            return null;
        }

        if (!ShouldQuerySiteBeforeRedeploy(prior))
        {
            return null;
        }

        DeploymentStateQueryResponse siteState;
        try
        {
            var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
            var stateRequest = new DeploymentStateQueryRequest(
                Guid.NewGuid().ToString("N"), instance.UniqueName, DateTimeOffset.UtcNow);

            _logger.LogInformation(
                "Querying site {SiteId} for applied deployment state of instance {Instance} " +
                "before re-deploy (prior record {DeploymentId} is {Status})",
                siteId, instance.UniqueName, prior.DeploymentId, prior.Status);

            siteState = await _communicationService.QueryDeploymentStateAsync(
                siteId, stateRequest, cancellationToken);
        }
        catch (Exception ex)
        {
            // A failed query (site unreachable / timeout) must NOT abort the
            // deploy. Fall through to the normal path; the site's own
            // stale-rejection of an older revision hash is the safety net.
            _logger.LogWarning(ex,
                "Site query before re-deploy of instance {Instance} failed; " +
                "proceeding with normal deploy (site-side stale-rejection is the safety net)",
                instance.UniqueName);
            return null;
        }

        var siteHasTargetRevision =
            siteState.IsDeployed &&
            string.Equals(siteState.AppliedRevisionHash, targetRevisionHash, StringComparison.Ordinal);
        if (!siteHasTargetRevision)
        {
            // The site is not deployed, or runs some other revision — proceed
            // with the normal deploy.
            return null;
        }

        // The site already runs the target revision, i.e. the prior deployment
        // actually succeeded. Repair the stale record rather than re-sending.
        _logger.LogInformation(
            "Site already has target revision {RevisionHash} for instance {Instance}; " +
            "marking prior deployment record {DeploymentId} Success without re-deploying",
            targetRevisionHash, instance.UniqueName, prior.DeploymentId);

        prior.Status = DeploymentStatus.Success;
        prior.ErrorMessage = null;
        prior.CompletedAt = DateTimeOffset.UtcNow;
        // DeploymentManager-016: the prior record may legitimately hold a
        // different (stale) revision hash than the current target. Since the site
        // confirmed it runs the target revision, the persisted record, the audit
        // entry below, and the site must all agree on that hash.
        prior.RevisionHash = targetRevisionHash;
        await _repository.UpdateDeploymentRecordAsync(prior, cancellationToken);
        await _repository.SaveChangesAsync(cancellationToken);
        NotifyStatusChange(prior);

        // DeploymentManager-015: a reconciled deployment performs the SAME
        // post-success side effects as the normal deploy path (instance state +
        // stored/refreshed deployed config snapshot); otherwise the central state
        // machine and the deployed-snapshot invariant drift from what the site
        // is running.
        //
        // DeploymentManager-018: this path only runs for InProgress or
        // timeout-Failed prior records — precisely the ones that survive a
        // central failover. The in-memory operation lock is lost on failover, so
        // an operator may have legitimately Disabled the instance between the
        // original timed-out deploy and this redeploy. Disable leaves the
        // deployed config alone, hence the site still reports the target hash.
        // Reconciliation must PRESERVE such an intentional Disabled state rather
        // than silently flipping it back to Enabled: forceEnabledState: false
        // makes the helper promote only NotDeployed → Enabled (the
        // first-deploy-timed-out case) and leave an explicit Disabled alone.
        await ApplyPostSuccessSideEffectsAsync(
            instance, prior.DeploymentId, targetRevisionHash, configJson,
            forceEnabledState: false, cancellationToken);

        // DeploymentManager-020: the audit row is attributed to the user driving
        // THIS redeploy (the caller of DeployInstanceAsync), not to whoever
        // issued the original timed-out / stuck deployment. The original deployer
        // is kept in the detail object so forensics can still see who launched
        // the run that reconciliation rescued.
        var auditDetail = new
        {
            prior.DeploymentId,
            RevisionHash = targetRevisionHash,
            OriginalDeployer = prior.DeployedBy
        };
        await _auditService.LogAsync(currentUser, "DeployReconciled", "Instance",
            instance.Id.ToString(), instance.UniqueName, auditDetail, cancellationToken);

        return prior;
    }

    /// <summary>
    /// DeploymentManager-006: decides whether the site must be queried before a
    /// re-deploy. Returns <c>true</c> when the prior record is stuck
    /// <see cref="DeploymentStatus.InProgress"/>, or is
    /// <see cref="DeploymentStatus.Failed"/> with an error message that starts
    /// with the <see cref="TimeoutFailurePrefix"/> marker (the site command timed
    /// out). Every other prior state skips the query.
    /// </summary>
    private static bool ShouldQuerySiteBeforeRedeploy(DeploymentRecord prior)
    {
        if (prior.Status == DeploymentStatus.InProgress)
        {
            return true;
        }

        if (prior.Status != DeploymentStatus.Failed)
        {
            return false;
        }

        // A Failed record qualifies only when its message carries the timeout marker.
        var failureMessage = prior.ErrorMessage;
        return failureMessage != null
            && failureMessage.StartsWith(TimeoutFailurePrefix, StringComparison.Ordinal);
    }

    /// <summary>
    /// Post-success persistence shared by the normal deploy path and the
    /// DeploymentManager-006 reconciliation path: update the instance state
    /// (WP-4) and store/refresh the deployed config snapshot (WP-8). Both paths
    /// go through this single helper so they cannot drift (DeploymentManager-015).
    ///
    /// DeploymentManager-018: <paramref name="forceEnabledState"/> tells the two
    /// callers apart. The normal deploy path passes <c>true</c>: a fresh
    /// successful apply legitimately moves the instance to
    /// <see cref="InstanceState.Enabled"/> (the documented "Deploy on a Disabled
    /// instance also enables it" semantics of <see cref="StateTransitionValidator"/>).
    /// The reconciliation path passes <c>false</c>, because it reconciles a
    /// *prior* deployment that may have completed before the current operator
    /// session — central failover loses the in-memory operation lock, so an
    /// operator may have legitimately Disabled the instance in between. There,
    /// only <see cref="InstanceState.NotDeployed"/> → <see cref="InstanceState.Enabled"/>
    /// is promoted (the first-deploy-timed-out case) and an explicit Disabled is
    /// left alone, so reconciliation never silently undoes a Disable.
    ///
    /// Best-effort: the caller has already committed the deployment record's
    /// terminal <see cref="DeploymentStatus.Success"/> status before calling this.
    /// A failure here is logged loudly for operator reconciliation but is NOT
    /// propagated — it must not flip the committed Success record back to Failed.
    /// </summary>
    private async Task ApplyPostSuccessSideEffectsAsync(
        Instance instance,
        string deploymentId,
        string revisionHash,
        string configJson,
        bool forceEnabledState,
        CancellationToken cancellationToken)
    {
        try
        {
            // WP-4 / DeploymentManager-018: a normal deploy always enables the
            // instance; reconciliation (forceEnabledState=false) enables it only
            // when it was never deployed, keeping an intentional Disabled intact.
            var promoteToEnabled = forceEnabledState || instance.State == InstanceState.NotDeployed;
            if (promoteToEnabled)
            {
                instance.State = InstanceState.Enabled;
            }

            await _repository.UpdateInstanceAsync(instance, cancellationToken);

            // WP-8: persist what was deployed, then commit both changes together.
            await StoreDeployedSnapshotAsync(
                instance.Id, deploymentId, revisionHash, configJson, cancellationToken);
            await _repository.SaveChangesAsync(cancellationToken);
        }
        catch (Exception persistenceFailure)
        {
            _logger.LogError(persistenceFailure,
                "Deployment {DeploymentId} for instance {Instance} was applied by the site and " +
                "recorded Success, but post-success persistence (instance state / config snapshot) " +
                "failed -- central and site state may diverge until reconciled",
                deploymentId, instance.UniqueName);
        }
    }

    /// <summary>
    /// DeploymentManager-019: records a "&lt;Action&gt;TimedOut" audit entry for a
    /// lifecycle command (Disable / Enable / Delete) whose site round-trip
    /// exceeded <see cref="DeploymentManagerOptions.LifecycleCommandTimeout"/>.
    ///
    /// <para>
    /// Follows the <c>DeployFailed</c> pattern in <see cref="DeployInstanceAsync"/>:
    /// the audit write is issued with <see cref="CancellationToken.None"/>, so the
    /// operator's outer cancellation (the usual reason this path runs) cannot
    /// also stop the audit row from being persisted. The detail object holds the
    /// lifecycle command id, the timeout that fired, and the original exception
    /// message, letting an operator correlate the audit entry with the
    /// UI-surfaced timeout error.
    /// </para>
    ///
    /// <para>
    /// The write is wrapped in try/catch: a failed audit write must NOT mask the
    /// underlying timeout from the caller. It is logged at Warning so the
    /// operator can reconcile, and is never rethrown.
    /// </para>
    /// </summary>
    /// <param name="user">The username who initiated the lifecycle command.</param>
    /// <param name="action">The audit action name (<c>DisableTimedOut</c>, <c>EnableTimedOut</c>, or <c>DeleteTimedOut</c>).</param>
    /// <param name="instanceId">The numeric instance id, recorded on the audit row.</param>
    /// <param name="instanceUniqueName">The instance unique name used as the audit target name.</param>
    /// <param name="commandId">The lifecycle command's correlation id, so the audit entry can be matched to logs.</param>
    /// <param name="timeoutException">The captured <see cref="TimeoutException"/> or <see cref="OperationCanceledException"/>.</param>
    private async Task TryLogLifecycleTimeoutAsync(
        string user,
        string action,
        int instanceId,
        string instanceUniqueName,
        string commandId,
        Exception timeoutException)
    {
        try
        {
            var timeoutDetail = new
            {
                CommandId = commandId,
                Deadline = _options.LifecycleCommandTimeout,
                Error = timeoutException.Message,
            };

            // CancellationToken.None: the audit row must be written even when the
            // caller's token has already been cancelled.
            await _auditService.LogAsync(
                user, action, "Instance", instanceId.ToString(), instanceUniqueName,
                timeoutDetail, CancellationToken.None);
        }
        catch (Exception auditFailure)
        {
            // Never let an audit-pipeline problem bury the timeout for the
            // caller; log it so an operator can investigate.
            _logger.LogWarning(auditFailure,
                "Failed to write {Action} audit entry for instance {Instance} (commandId={CommandId})",
                action, instanceUniqueName, commandId);
        }
    }

    /// <summary>
    /// WP-8: Creates or refreshes the deployed config snapshot of an instance.
    /// When the repository has no snapshot for <paramref name="instanceId"/> a new
    /// <see cref="DeployedConfigSnapshot"/> is added; otherwise the existing one
    /// is overwritten with the given deployment ID, revision hash, and
    /// configuration JSON. In both cases <c>DeployedAt</c> is set to the current
    /// UTC time. This method does not call <c>SaveChangesAsync</c> itself.
    /// </summary>
    /// <param name="instanceId">The database ID of the instance the snapshot belongs to.</param>
    /// <param name="deploymentId">The deployment ID recorded on the snapshot.</param>
    /// <param name="revisionHash">The revision hash recorded on the snapshot.</param>
    /// <param name="configJson">The configuration JSON recorded on the snapshot.</param>
    /// <param name="cancellationToken">Cancellation token for the operation.</param>
    private async Task StoreDeployedSnapshotAsync(
        int instanceId,
        string deploymentId,
        string revisionHash,
        string configJson,
        CancellationToken cancellationToken)
    {
        var current = await _repository.GetDeployedSnapshotByInstanceIdAsync(instanceId, cancellationToken);
        if (current == null)
        {
            // First snapshot for this instance: insert a new row.
            await _repository.AddDeployedSnapshotAsync(
                new DeployedConfigSnapshot(deploymentId, revisionHash, configJson)
                {
                    InstanceId = instanceId,
                    DeployedAt = DateTimeOffset.UtcNow
                },
                cancellationToken);
            return;
        }

        // A snapshot already exists: overwrite it in place.
        current.DeploymentId = deploymentId;
        current.RevisionHash = revisionHash;
        current.ConfigurationJson = configJson;
        current.DeployedAt = DateTimeOffset.UtcNow;
        await _repository.UpdateDeployedSnapshotAsync(current, cancellationToken);
    }
}

/// <summary>
/// WP-8: Result of comparing deployed vs template-derived configuration.
/// </summary>
/// <param name="InstanceId">The database ID of the instance that was compared.</param>
/// <param name="DeployedRevisionHash">
/// Revision hash of the deployed side of the comparison: recomputed from the
/// List-normalized snapshot when the snapshot deserializes, otherwise the hash
/// stored on the snapshot.
/// </param>
/// <param name="CurrentRevisionHash">Revision hash of the freshly flattened, template-derived configuration.</param>
/// <param name="IsStale">
/// True when <paramref name="DeployedRevisionHash"/> differs from
/// <paramref name="CurrentRevisionHash"/>.
/// </param>
/// <param name="DeployedAt">The deployment timestamp recorded on the deployed snapshot.</param>
/// <param name="Diff">
/// DeploymentManager-007: structured added/removed/changed detail for
/// attributes, alarms, and scripts. Null only when the deployed snapshot could
/// not be deserialized (corrupt / older schema), in which case
/// <see cref="IsStale"/> still reflects the hash comparison.
/// </param>
public record DeploymentComparisonResult(
    int InstanceId,
    string DeployedRevisionHash,
    string CurrentRevisionHash,
    bool IsStale,
    DateTimeOffset DeployedAt,
    ConfigurationDiff? Diff = null);