refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)

Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
2026-05-28 09:37:45 -04:00
parent 6d87ee3c3b
commit 7b0b9c7365
1531 changed files with 11180 additions and 11054 deletions
@@ -0,0 +1,945 @@
+using System.Text.Json;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using ZB.MOM.WW.ScadaBridge.Commons.Entities.Deployment;
+using ZB.MOM.WW.ScadaBridge.Commons.Entities.Instances;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Lifecycle;
+using ZB.MOM.WW.ScadaBridge.Commons.Types;
+using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
+using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening;
+using ZB.MOM.WW.ScadaBridge.Communication;
+using ZB.MOM.WW.ScadaBridge.TemplateEngine.Flattening;
+using ZB.MOM.WW.ScadaBridge.TemplateEngine.Validation;
+
+namespace ZB.MOM.WW.ScadaBridge.DeploymentManager;
+
+/// <summary>
+/// WP-1: Central-side deployment orchestration service.
+/// Coordinates the full deployment pipeline:
+///   1. Validate instance state transition (WP-4)
+///   2. Acquire per-instance operation lock (WP-3)
+///   3. Flatten configuration via TemplateEngine (captures template state at time of flatten -- WP-16)
+///   4. Validate flattened configuration
+///   5. Compute revision hash and diff
+///   6. Send DeployInstanceCommand to site via CommunicationService
+///   7. Track deployment status with optimistic concurrency (WP-4)
+///   8. Store deployed config snapshot (WP-8)
+///   9. Audit log all actions
+///
+/// WP-2: Each deployment has a unique deployment ID (GUID) + revision hash.
+/// WP-16: Template state captured at flatten time -- last-write-wins on templates is safe.
+/// </summary>
+public class DeploymentService
+{
+    private readonly IDeploymentManagerRepository _repository;
+    private readonly ISiteRepository _siteRepository;
+    private readonly IFlatteningPipeline _flatteningPipeline;
+    private readonly CommunicationService _communicationService;
+    private readonly OperationLockManager _lockManager;
+    private readonly IAuditService _auditService;
+    private readonly DiffService _diffService;
+    private readonly IDeploymentStatusNotifier _statusNotifier;
+    private readonly DeploymentManagerOptions _options;
+    private readonly ILogger<DeploymentService> _logger;
+
+    /// <summary>
+    /// Prefix written to <see cref="DeploymentRecord.ErrorMessage"/> when a
+    /// deployment fails because the site command timed out or was cancelled.
+    /// Used by the query-before-redeploy trigger (DeploymentManager-006) to tell
+    /// a timeout-induced failure apart from other deployment errors.
+    /// </summary>
+    private const string TimeoutFailurePrefix = "Communication failure:";
+
+    /// <summary>
+    /// Initializes a new instance of <see cref="DeploymentService"/> with all required dependencies.
+    /// </summary>
+    /// <param name="repository">Repository for deployment manager data access.</param>
+    /// <param name="siteRepository">Repository for site data access.</param>
+    /// <param name="flatteningPipeline">Pipeline for flattening and validating template configurations.</param>
+    /// <param name="communicationService">Service for cross-cluster communication with sites.</param>
+    /// <param name="lockManager">Manager for per-instance operation locks.</param>
+    /// <param name="auditService">Service for recording audit log entries.</param>
+    /// <param name="diffService">Service for computing configuration diffs.</param>
+    /// <param name="statusNotifier">Notifier for pushing deployment status changes to the UI.</param>
+    /// <param name="options">Deployment manager configuration options.</param>
+    /// <param name="logger">Logger instance.</param>
+    public DeploymentService(
+        IDeploymentManagerRepository repository,
+        ISiteRepository siteRepository,
+        IFlatteningPipeline flatteningPipeline,
+        CommunicationService communicationService,
+        OperationLockManager lockManager,
+        IAuditService auditService,
+        DiffService diffService,
+        IDeploymentStatusNotifier statusNotifier,
+        IOptions<DeploymentManagerOptions> options,
+        ILogger<DeploymentService> logger)
+    {
+        _repository = repository;
+        _siteRepository = siteRepository;
+        _flatteningPipeline = flatteningPipeline;
+        _communicationService = communicationService;
+        _lockManager = lockManager;
+        _auditService = auditService;
+        _diffService = diffService;
+        _statusNotifier = statusNotifier;
+        _options = options.Value;
+        _logger = logger;
+    }
+
+    /// <summary>
+    /// CentralUI-006: raises a push notification that a deployment record's
+    /// status was just persisted, so the Central UI deployment-status page can
+    /// re-render over its SignalR circuit instead of polling. Called at every
+    /// point a <see cref="DeploymentRecord"/> status is written.
+    /// </summary>
+    private void NotifyStatusChange(DeploymentRecord record) =>
+        _statusNotifier.NotifyStatusChanged(
+            new DeploymentStatusChange(record.DeploymentId, record.InstanceId, record.Status));
+
+    /// <summary>
+    /// Resolves the site's string identifier from the numeric DB ID.
+    /// The communication layer routes by string identifier (e.g. "site-a"), not DB ID.
+    ///
+    /// DeploymentManager-021: when the <see cref="Site"/> row is missing (FK was
+    /// deleted, race with admin delete, DB inconsistency) the previous behaviour
+    /// silently substituted the numeric id rendered as a string — every
+    /// downstream `CommunicationService` call then failed with a confusing
+    /// "unknown site" routing error that hid the real cause. Treat a missing
+    /// site row as a hard validation failure: throw
+    /// <see cref="InvalidOperationException"/> naming the unresolved id so the
+    /// operator sees the actual problem. On the deploy path the existing
+    /// try/catch turns this into a Failed deployment record with a clear
+    /// message; lifecycle paths propagate it to the caller (CLI/UI) which
+    /// surface it as an error to the operator.
+    /// </summary>
+    private async Task<string> ResolveSiteIdentifierAsync(int siteId, CancellationToken cancellationToken)
+    {
+        var site = await _siteRepository.GetSiteByIdAsync(siteId, cancellationToken);
+        if (site == null)
+            throw new InvalidOperationException(
+                $"Site with ID {siteId} not found; cannot resolve its SiteIdentifier for routing.");
+        return site.SiteIdentifier;
+    }
+
+    /// <summary>
+    /// WP-1: Deploy an instance to its site.
+    /// WP-2: Generates unique deployment ID, computes revision hash.
+    /// WP-4: Validates state transitions, uses optimistic concurrency.
+    /// WP-5: Site-side apply is all-or-nothing (handled by DeploymentManagerActor).
+    /// WP-8: Stores deployed config snapshot on success.
+    /// WP-16: Captures template state at time of flatten.
+    /// </summary>
+    /// <param name="instanceId">The database ID of the instance to deploy.</param>
+    /// <param name="user">The username initiating the deployment, recorded in the audit log.</param>
+    /// <param name="cancellationToken">Cancellation token for the operation.</param>
+    public async Task<Result<DeploymentRecord>> DeployInstanceAsync(
+        int instanceId,
+        string user,
+        CancellationToken cancellationToken = default)
+    {
+        // Load instance
+        var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
+        if (instance == null)
+            return Result<DeploymentRecord>.Failure($"Instance with ID {instanceId} not found.");
+
+        // WP-4: Validate state transition
+        var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "deploy");
+        if (transitionError != null)
+            return Result<DeploymentRecord>.Failure(transitionError);
+
+        // WP-3: Acquire per-instance operation lock
+        using var lockHandle = await _lockManager.AcquireAsync(
+            instance.UniqueName, _options.OperationLockTimeout, cancellationToken);
+
+        // WP-2: Generate unique deployment ID
+        var deploymentId = Guid.NewGuid().ToString("N");
+
+        // WP-1/16: Flatten configuration (captures template state at this point in time)
+        var flattenResult = await _flatteningPipeline.FlattenAndValidateAsync(instanceId, cancellationToken);
+        if (flattenResult.IsFailure)
+            return Result<DeploymentRecord>.Failure($"Validation failed: {flattenResult.Error}");
+
+        var flattenedConfig = flattenResult.Value.Configuration;
+        var revisionHash = flattenResult.Value.RevisionHash;
+        var validationResult = flattenResult.Value.Validation;
+
+        if (!validationResult.IsValid)
+        {
+            var errors = string.Join("; ", validationResult.Errors.Select(e => e.Message));
+            return Result<DeploymentRecord>.Failure($"Pre-deployment validation failed: {errors}");
+        }
+
+        // Serialize for transmission (also the payload stored in the deployed
+        // snapshot on success / reconciliation).
+        var configJson = JsonSerializer.Serialize(flattenedConfig);
+
+        // DeploymentManager-006: query-the-site-before-redeploy idempotency.
+        // If a prior deployment for this instance is stuck InProgress or Failed
+        // due to a timeout, the site may have actually applied the config. Query
+        // the site for its currently-applied revision before re-sending so a
+        // duplicate deployment is not produced (design: "Deployment Identity &
+        // Idempotency"). A clean prior Success or a fresh first-time deploy
+        // skips this extra round-trip.
+        var reconciled = await TryReconcileWithSiteAsync(
+            instance, revisionHash, configJson, user, cancellationToken);
+        if (reconciled != null)
+            return Result<DeploymentRecord>.Success(reconciled);
+
+        // WP-4: Create the deployment record directly in InProgress.
+        //
+        // DeploymentManager-022: the previous code wrote the record as Pending,
+        // then immediately updated it to InProgress with no work in between
+        // (flattening, validation, and reconciliation all completed above). The
+        // back-to-back write cost an extra SaveChangesAsync round-trip, an
+        // extra IDeploymentStatusNotifier push (CentralUI-006 rendered a
+        // Pending→InProgress flicker for ~ms), and an extra row-version bump
+        // for nothing. The transient Pending slot carried no operational
+        // meaning — it was set and immediately overwritten — so dropping it
+        // collapses the start of the deploy into a single insert + notify.
+        // InProgress remains the documented "sent to site, awaiting response"
+        // state, set immediately before the round-trip below.
+        var record = new DeploymentRecord(deploymentId, user)
+        {
+            InstanceId = instanceId,
+            Status = DeploymentStatus.InProgress,
+            RevisionHash = revisionHash,
+            DeployedAt = DateTimeOffset.UtcNow
+        };
+
+        await _repository.AddDeploymentRecordAsync(record, cancellationToken);
+        await _repository.SaveChangesAsync(cancellationToken);
+        NotifyStatusChange(record);
+
+        try
+        {
+            // WP-1: Send to site via CommunicationService
+            var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
+            var command = new DeployInstanceCommand(
+                deploymentId, instance.UniqueName, revisionHash, configJson, user, DateTimeOffset.UtcNow);
+
+            _logger.LogInformation(
+                "Sending deployment {DeploymentId} for instance {Instance} to site {SiteId}",
+                deploymentId, instance.UniqueName, siteId);
+
+            var response = await _communicationService.DeployInstanceAsync(siteId, command, cancellationToken);
+
+            // WP-1: Update status based on site response.
+            record.Status = response.Status;
+            record.ErrorMessage = response.ErrorMessage;
+            record.CompletedAt = DateTimeOffset.UtcNow;
+
+            // DeploymentManager-003: once the site has confirmed the apply,
+            // commit the deployment record's terminal status BEFORE touching
+            // instance state and the deployed-config snapshot. If a later write
+            // (instance update / snapshot store) fails, the recorded fact that
+            // the site succeeded must NOT be lost -- otherwise central reports a
+            // non-Success record while the site is running the new config.
+            await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
+            await _repository.SaveChangesAsync(cancellationToken);
+            NotifyStatusChange(record);
+
+            if (response.Status == DeploymentStatus.Success)
+            {
+                // The site has applied the deployment. The post-success
+                // persistence below is best-effort: a failure here must be
+                // logged loudly for operator reconciliation but must not flip
+                // the already-committed Success record back to Failed.
+                await ApplyPostSuccessSideEffectsAsync(
+                    instance, deploymentId, revisionHash, configJson,
+                    forceEnabledState: true, cancellationToken);
+            }
+
+            // Audit log
+            await _auditService.LogAsync(user, "Deploy", "Instance", instanceId.ToString(),
+                instance.UniqueName, new { DeploymentId = deploymentId, Status = record.Status.ToString() },
+                cancellationToken);
+
+            _logger.LogInformation(
+                "Deployment {DeploymentId} for instance {Instance}: {Status}",
+                deploymentId, instance.UniqueName, record.Status);
+
+            return record.Status == DeploymentStatus.Success
+                ? Result<DeploymentRecord>.Success(record)
+                : Result<DeploymentRecord>.Failure(
+                    $"Deployment failed: {response.ErrorMessage ?? "Unknown error"}");
+        }
+        catch (Exception ex)
+        {
+            // DeploymentManager-001: any exception out of the try (timeout,
+            // cancellation, transport, serialization, DB) must leave the
+            // deployment record as Failed -- the design requires an interrupted
+            // deployment to be treated as failed, never stuck in InProgress.
+            //
+            // DeploymentManager-002: the failure-status write must NOT use the
+            // operation's cancellation token. If the operation was cancelled or
+            // timed out, that token is already cancelled and the cleanup writes
+            // would themselves throw before the Failed status is persisted.
+            // Use CancellationToken.None so the failure is durably recorded.
+            var isTimeout = ex is TimeoutException or OperationCanceledException;
+
+            record.Status = DeploymentStatus.Failed;
+            record.ErrorMessage = isTimeout
+                ? $"{TimeoutFailurePrefix} {ex.Message}"
+                : $"Deployment error: {ex.Message}";
+            record.CompletedAt = DateTimeOffset.UtcNow;
+
+            try
+            {
+                await _repository.UpdateDeploymentRecordAsync(record, CancellationToken.None);
+                await _repository.SaveChangesAsync(CancellationToken.None);
+                NotifyStatusChange(record);
+
+                await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
+                    instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message },
+                    CancellationToken.None);
+            }
+            catch (Exception cleanupEx)
+            {
+                // The deployment already failed; a failed cleanup write must not
+                // mask the original error. Log loudly so an operator can reconcile.
+                _logger.LogError(cleanupEx,
+                    "Failed to persist Failed status for deployment {DeploymentId} of instance {Instance} " +
+                    "after deployment error: {Error}",
+                    deploymentId, instance.UniqueName, ex.Message);
+            }
+
+            _logger.LogError(ex,
+                "Deployment {DeploymentId} for instance {Instance} failed",
+                deploymentId, instance.UniqueName);
+
+            return Result<DeploymentRecord>.Failure(
+                isTimeout
+                    ? $"Deployment timed out: {ex.Message}"
+                    : $"Deployment failed: {ex.Message}");
+        }
+    }
+
+    /// <summary>
+    /// WP-6: Disable an instance. Stops Instance Actor, retains config, S&amp;F drains.
+    /// </summary>
+    /// <param name="instanceId">The database ID of the instance to disable.</param>
+    /// <param name="user">The username initiating the operation, recorded in the audit log.</param>
+    /// <param name="cancellationToken">Cancellation token for the operation.</param>
+    public async Task<Result<InstanceLifecycleResponse>> DisableInstanceAsync(
+        int instanceId,
+        string user,
+        CancellationToken cancellationToken = default)
+    {
+        var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
+        if (instance == null)
+            return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");
+
+        var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "disable");
+        if (transitionError != null)
+            return Result<InstanceLifecycleResponse>.Failure(transitionError);
+
+        using var lockHandle = await _lockManager.AcquireAsync(
+            instance.UniqueName, _options.OperationLockTimeout, cancellationToken);
+
+        var commandId = Guid.NewGuid().ToString("N");
+        var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
+        var command = new DisableInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);
+
+        // WP-6: bound the round-trip with the configured lifecycle timeout so a
+        // hung/unreachable site does not block the operation lock indefinitely.
+        InstanceLifecycleResponse response;
+        try
+        {
+            using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
+            cts.CancelAfter(_options.LifecycleCommandTimeout);
+            response = await _communicationService.DisableInstanceAsync(siteId, command, cts.Token);
+        }
+        catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
+        {
+            // DeploymentManager-019: a lifecycle command timeout produced no
+            // audit row pre-fix — the operator saw a timeout in the UI but
+            // the audit trail showed nothing happened, contrary to the
+            // design's "audit logging for all instance lifecycle changes"
+            // rule. Mirror the DeployFailed pattern: write a "<Action>TimedOut"
+            // entry with CancellationToken.None so a cancelled outer token
+            // (the typical reason this catch ran) cannot prevent the
+            // durable audit write.
+            await TryLogLifecycleTimeoutAsync(
+                user, "DisableTimedOut", instanceId, instance.UniqueName, commandId, ex);
+
+            _logger.LogWarning(ex, "Disable of instance {Instance} timed out", instance.UniqueName);
+            return Result<InstanceLifecycleResponse>.Failure(
+                $"Disable failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
+        }
+
+        if (response.Success)
+        {
+            instance.State = InstanceState.Disabled;
+            await _repository.UpdateInstanceAsync(instance, cancellationToken);
+            await _repository.SaveChangesAsync(cancellationToken);
+        }
+
+        await _auditService.LogAsync(user, "Disable", "Instance", instanceId.ToString(),
+            instance.UniqueName, new { CommandId = commandId, response.Success },
+            cancellationToken);
+
+        return response.Success
+            ? Result<InstanceLifecycleResponse>.Success(response)
+            : Result<InstanceLifecycleResponse>.Failure(response.ErrorMessage ?? "Disable failed.");
+    }
+
+    /// <summary>
+    /// WP-6: Enable an instance. Re-creates Instance Actor from stored config.
+    /// </summary>
+    /// <param name="instanceId">The database ID of the instance to enable.</param>
+    /// <param name="user">The username initiating the operation, recorded in the audit log.</param>
+    /// <param name="cancellationToken">Cancellation token for the operation.</param>
+    public async Task<Result<InstanceLifecycleResponse>> EnableInstanceAsync(
+        int instanceId,
+        string user,
+        CancellationToken cancellationToken = default)
+    {
+        var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
+        if (instance == null)
+            return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");
+
+        var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "enable");
+        if (transitionError != null)
+            return Result<InstanceLifecycleResponse>.Failure(transitionError);
+
+        using var lockHandle = await _lockManager.AcquireAsync(
+            instance.UniqueName, _options.OperationLockTimeout, cancellationToken);
+
+        var commandId = Guid.NewGuid().ToString("N");
+        var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
+        var command = new EnableInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);
+
+        // WP-6: bound the round-trip with the configured lifecycle timeout.
+        InstanceLifecycleResponse response;
+        try
+        {
+            using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
+            cts.CancelAfter(_options.LifecycleCommandTimeout);
+            response = await _communicationService.EnableInstanceAsync(siteId, command, cts.Token);
+        }
+        catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
+        {
+            // DeploymentManager-019: emit an audit entry on lifecycle timeout
+            // so the operator's attempted Enable is recorded; see the matching
+            // comment in DisableInstanceAsync for the full rationale.
+            await TryLogLifecycleTimeoutAsync(
+                user, "EnableTimedOut", instanceId, instance.UniqueName, commandId, ex);
+
+            _logger.LogWarning(ex, "Enable of instance {Instance} timed out", instance.UniqueName);
+            return Result<InstanceLifecycleResponse>.Failure(
+                $"Enable failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
+        }
+
+        if (response.Success)
+        {
+            instance.State = InstanceState.Enabled;
+            await _repository.UpdateInstanceAsync(instance, cancellationToken);
+            await _repository.SaveChangesAsync(cancellationToken);
+        }
+
+        await _auditService.LogAsync(user, "Enable", "Instance", instanceId.ToString(),
+            instance.UniqueName, new { CommandId = commandId, response.Success },
+            cancellationToken);
+
+        return response.Success
+            ? Result<InstanceLifecycleResponse>.Success(response)
+            : Result<InstanceLifecycleResponse>.Failure(response.ErrorMessage ?? "Enable failed.");
+    }
+
+    /// <summary>
+    /// WP-6: Delete an instance. Stops the site actor, removes site config, and
+    /// removes the central instance record (deployment history, snapshot,
+    /// overrides, and connection bindings go with it). S&amp;F NOT cleared.
+    /// Delete fails if the site is unreachable within
+    /// <c>CommunicationOptions.LifecycleTimeout</c> (applied inside
+    /// <see cref="CommunicationService.DeleteInstanceAsync"/>).
+    /// </summary>
+    /// <param name="instanceId">The database ID of the instance to delete.</param>
+    /// <param name="user">The username initiating the deletion, recorded in the audit log.</param>
+    /// <param name="cancellationToken">Cancellation token for the operation.</param>
+    public async Task<Result<InstanceLifecycleResponse>> DeleteInstanceAsync(
+        int instanceId,
+        string user,
+        CancellationToken cancellationToken = default)
+    {
+        var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
+        if (instance == null)
+            return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");
+
+        var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "delete");
+        if (transitionError != null)
+            return Result<InstanceLifecycleResponse>.Failure(transitionError);
+
+        using var lockHandle = await _lockManager.AcquireAsync(
+            instance.UniqueName, _options.OperationLockTimeout, cancellationToken);
+
+        var commandId = Guid.NewGuid().ToString("N");
+        var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
+        var command = new DeleteInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);
+
+        // WP-6: bound the round-trip with the configured lifecycle timeout.
+        InstanceLifecycleResponse response;
+        try
+        {
+            using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
+            cts.CancelAfter(_options.LifecycleCommandTimeout);
+            response = await _communicationService.DeleteInstanceAsync(siteId, command, cts.Token);
+        }
+        catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
+        {
+            // DeploymentManager-019: emit an audit entry on lifecycle timeout
+            // so the operator's attempted Delete is recorded; see the matching
+            // comment in DisableInstanceAsync for the full rationale.
+            await TryLogLifecycleTimeoutAsync(
+                user, "DeleteTimedOut", instanceId, instance.UniqueName, commandId, ex);
+
+            _logger.LogWarning(ex, "Delete of instance {Instance} timed out", instance.UniqueName);
+            return Result<InstanceLifecycleResponse>.Failure(
+                $"Delete failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
+        }
+
+        if (response.Success)
+        {
+            // Delete means delete: remove the instance record entirely.
+            // Deployment records, snapshot, overrides, and connection bindings
+            // are removed with it (see repository implementation).
+            //
+            // DeploymentManager-004: the site has already destroyed the Instance
+            // Actor and removed its config. If the central record removal now
+            // fails (DB error / concurrency), the exception must NOT escape
+            // uncaught -- that would leave the central record orphaned and
+            // un-deletable through the normal path (a re-issued delete may fail
+            // because the site no longer has the instance). Surface a distinct
+            // failure so an operator can reconcile.
+            try
+            {
+                await _repository.DeleteInstanceAsync(instanceId, cancellationToken);
+                await _repository.SaveChangesAsync(cancellationToken);
+            }
+            catch (Exception ex)
+            {
+                _logger.LogError(ex,
+                    "Instance {Instance} was deleted at the site, but the central record could not be " +
+                    "removed -- the central record is now orphaned and must be reconciled manually",
+                    instance.UniqueName);
+
+                await _auditService.LogAsync(user, "DeleteOrphaned", "Instance", instanceId.ToString(),
+                    instance.UniqueName, new { CommandId = commandId, Error = ex.Message },
+                    CancellationToken.None);
+
+                return Result<InstanceLifecycleResponse>.Failure(
+                    $"The site deleted instance '{instance.UniqueName}', but the central record could not " +
+                    $"be removed: {ex.Message}. The central record is orphaned and must be reconciled.");
+            }
+        }
+
+        await _auditService.LogAsync(user, "Delete", "Instance", instanceId.ToString(),
+            instance.UniqueName, new { CommandId = commandId, response.Success },
+            cancellationToken);
+
+        return response.Success
+            ? Result<InstanceLifecycleResponse>.Success(response)
+            : Result<InstanceLifecycleResponse>.Failure(
+                response.ErrorMessage ?? "Delete failed. Site may be unreachable.");
+    }
+
+    /// <summary>
+    /// WP-8: Get the deployed config snapshot and compare with current
+    /// template-derived state. Produces both a staleness flag and — per the
+    /// design's "Diff View" — a structured <see cref="ConfigurationDiff"/> of
+    /// added/removed/changed attributes, alarms, and scripts (including data
+    /// connection binding changes) computed by the TemplateEngine
+    /// <see cref="DiffService"/>.
+    /// </summary>
+    /// <param name="instanceId">The database ID of the instance to compare.</param>
+    /// <param name="cancellationToken">Cancellation token for the operation.</param>
+    public async Task<Result<DeploymentComparisonResult>> GetDeploymentComparisonAsync(
+        int instanceId,
+        CancellationToken cancellationToken = default)
+    {
+        var snapshot = await _repository.GetDeployedSnapshotByInstanceIdAsync(instanceId, cancellationToken);
+        if (snapshot == null)
+            return Result<DeploymentComparisonResult>.Failure("No deployed snapshot found for this instance.");
+
+        // Compute current template-derived config
+        var currentResult = await _flatteningPipeline.FlattenAndValidateAsync(instanceId, cancellationToken);
+        if (currentResult.IsFailure)
+            return Result<DeploymentComparisonResult>.Failure($"Cannot compute current config: {currentResult.Error}");
+
+        var currentConfig = currentResult.Value.Configuration;
+        var currentHash = currentResult.Value.RevisionHash;
+        var isStale = snapshot.RevisionHash != currentHash;
+
+        // DeploymentManager-007: deserialize the deployed snapshot and run the
+        // TemplateEngine DiffService so the result carries real
+        // added/removed/changed detail, not just a hash comparison. A snapshot
+        // that cannot be deserialized (corrupt / older schema) still yields the
+        // hash-based staleness result, with a null diff.
+        ConfigurationDiff? diff = null;
+        try
+        {
+            var deployedConfig = JsonSerializer.Deserialize<FlattenedConfiguration>(snapshot.ConfigurationJson);
+            if (deployedConfig != null)
+            {
+                diff = _diffService.ComputeDiff(
+                    deployedConfig, currentConfig, snapshot.RevisionHash, currentHash);
+            }
+            else
+            {
+                _logger.LogWarning(
+                    "Deployed snapshot for instance {InstanceId} deserialized to null; " +
+                    "returning hash-based comparison without a structured diff",
+                    instanceId);
+            }
+        }
+        catch (JsonException ex)
+        {
+            _logger.LogWarning(ex,
+                "Could not deserialize deployed snapshot for instance {InstanceId}; " +
+                "returning hash-based comparison without a structured diff",
+                instanceId);
+        }
+
+        var result = new DeploymentComparisonResult(
+            instanceId,
+            snapshot.RevisionHash,
+            currentHash,
+            isStale,
+            snapshot.DeployedAt,
+            diff);
+
+        return Result<DeploymentComparisonResult>.Success(result);
+    }
+
+    /// <summary>
+    /// WP-2: Returns the current persisted <see cref="DeploymentRecord"/> for
+    /// the given deployment ID from the configuration database. This is a pure
+    /// local DB read — it does not contact the site. The query-the-site-before-
+    /// redeploy reconciliation (design: "Deployment Identity &amp; Idempotency")
+    /// lives in <see cref="TryReconcileWithSiteAsync"/>, which
+    /// <see cref="DeployInstanceAsync"/> invokes on the deploy path.
+    /// </summary>
+    /// <param name="deploymentId">The unique deployment identifier to look up.</param>
+    /// <param name="cancellationToken">Cancellation token for the operation.</param>
+    public async Task<DeploymentRecord?> GetDeploymentStatusAsync(
+        string deploymentId,
+        CancellationToken cancellationToken = default)
+    {
+        return await _repository.GetDeploymentByDeploymentIdAsync(deploymentId, cancellationToken);
+    }
+
+    /// <summary>
+    /// DeploymentManager-006: query-the-site-before-redeploy reconciliation.
+    ///
+    /// The site query is issued ONLY when a prior <see cref="DeploymentRecord"/>
+    /// for this instance is stuck <see cref="DeploymentStatus.InProgress"/>, or
+    /// is <see cref="DeploymentStatus.Failed"/> due to a timeout — the only
+    /// cases where the site may have applied the config without central
+    /// learning of it. Fresh first-time deploys and redeploys after a clean
+    /// prior <see cref="DeploymentStatus.Success"/> skip the extra round-trip.
+    ///
+    /// Reconciliation: if the site already has the TARGET revision hash, the
+    /// prior record is marked <see cref="DeploymentStatus.Success"/> (with its
+    /// <see cref="DeploymentRecord.RevisionHash"/> corrected to the target —
+    /// DeploymentManager-016) and returned (the caller must NOT re-send the
+    /// deploy). The same post-success side effects as the normal deploy path
+    /// are applied — instance <see cref="InstanceState.Enabled"/> and a stored
+    /// <see cref="DeployedConfigSnapshot"/> (DeploymentManager-015) — so central
+    /// and site state do not diverge. Otherwise <c>null</c> is returned and the
+    /// normal deploy proceeds.
+    ///
+    /// Query failure: if the site is unreachable or the query times out, this
+    /// returns <c>null</c> (fall through to a normal deploy) — site-side
+    /// stale-rejection of an older revision hash is the safety net. The deploy
+    /// is never aborted on a failed query.
+    /// </summary>
+    private async Task<DeploymentRecord?> TryReconcileWithSiteAsync(
+        Instance instance,
+        string targetRevisionHash,
+        string configJson,
+        string currentUser,
+        CancellationToken cancellationToken)
+    {
+        var prior = await _repository.GetCurrentDeploymentStatusAsync(instance.Id, cancellationToken);
+        if (prior == null || !ShouldQuerySiteBeforeRedeploy(prior))
+            return null;
+
+        DeploymentStateQueryResponse response;
+        try
+        {
+            var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
+            var query = new DeploymentStateQueryRequest(
+                Guid.NewGuid().ToString("N"), instance.UniqueName, DateTimeOffset.UtcNow);
+
+            _logger.LogInformation(
+                "Querying site {SiteId} for applied deployment state of instance {Instance} " +
+                "before re-deploy (prior record {DeploymentId} is {Status})",
+                siteId, instance.UniqueName, prior.DeploymentId, prior.Status);
+
+            response = await _communicationService.QueryDeploymentStateAsync(
+                siteId, query, cancellationToken);
+        }
+        catch (Exception ex)
+        {
+            // Query failure (site unreachable / timeout): do NOT abort. Fall
+            // through to a normal deploy; site-side stale-rejection of an older
+            // revision hash is the safety net.
+            _logger.LogWarning(ex,
+                "Site query before re-deploy of instance {Instance} failed; " +
+                "proceeding with normal deploy (site-side stale-rejection is the safety net)",
+                instance.UniqueName);
+            return null;
+        }
+
+        if (response.IsDeployed &&
+            string.Equals(response.AppliedRevisionHash, targetRevisionHash, StringComparison.Ordinal))
+        {
+            // The site already has the target revision — the prior deployment
+            // actually succeeded. Reconcile the stale record instead of
+            // re-sending the deploy.
+            _logger.LogInformation(
+                "Site already has target revision {RevisionHash} for instance {Instance}; " +
+                "marking prior deployment record {DeploymentId} Success without re-deploying",
+                targetRevisionHash, instance.UniqueName, prior.DeploymentId);
+
+            prior.Status = DeploymentStatus.Success;
+            prior.ErrorMessage = null;
+            prior.CompletedAt = DateTimeOffset.UtcNow;
+            // DeploymentManager-016: the prior record can legitimately carry a
+            // different (stale) revision hash than the current target. The site
+            // confirmed it is running the target revision, so the persisted
+            // record, the audit entry below, and the site must all agree.
+            prior.RevisionHash = targetRevisionHash;
+            await _repository.UpdateDeploymentRecordAsync(prior, cancellationToken);
+            await _repository.SaveChangesAsync(cancellationToken);
+            NotifyStatusChange(prior);
+
+            // DeploymentManager-015: a reconciled deployment must perform the
+            // SAME post-success side effects as the normal deploy path — set
+            // the instance State to Enabled and store/refresh the deployed
+            // config snapshot — otherwise the central state machine and the
+            // deployed-snapshot invariant diverge from what the site is running.
+            //
+            // DeploymentManager-018: the reconciliation path runs only when the
+            // prior record is InProgress or timeout-Failed — exactly the cases
+            // that survive a central failover. The in-memory operation lock is
+            // lost on failover, so an operator may have legitimately invoked
+            // Disable on the instance between the original timed-out deploy and
+            // this redeploy. Disable does not change the deployed config, so the
+            // site still reports the target revision hash. Reconciliation must
+            // therefore PRESERVE an intentional Disabled state instead of
+            // silently flipping it back to Enabled — pass forceEnabledState:
+            // false so the helper only promotes NotDeployed → Enabled (the
+            // first-deploy-timed-out case) and leaves an explicit Disabled
+            // alone.
+            await ApplyPostSuccessSideEffectsAsync(
+                instance, prior.DeploymentId, targetRevisionHash, configJson,
+                forceEnabledState: false, cancellationToken);
+
+            // DeploymentManager-020: attribute the audit row to the user driving
+            // THIS redeploy (the caller of DeployInstanceAsync), not the user
+            // who issued the original timed-out / stuck deployment. The original
+            // deployer is preserved in the detail object so forensics can still
+            // see who launched the run that reconciliation rescued.
+            await _auditService.LogAsync(currentUser, "DeployReconciled", "Instance",
+                instance.Id.ToString(), instance.UniqueName,
+                new
+                {
+                    DeploymentId = prior.DeploymentId,
+                    RevisionHash = targetRevisionHash,
+                    OriginalDeployer = prior.DeployedBy
+                },
+                cancellationToken);
+
+            return prior;
+        }
+
+        // Site does not have the target revision (or is not deployed) — proceed
+        // with the normal deploy.
+        return null;
+    }
+
+    /// <summary>
+    /// DeploymentManager-006: the site is queried before a re-deploy only when a
+    /// prior record is stuck <see cref="DeploymentStatus.InProgress"/>, or is
+    /// <see cref="DeploymentStatus.Failed"/> because the site command timed out
+    /// (detected via the <see cref="TimeoutFailurePrefix"/> error-message
+    /// marker). All other prior states skip the query.
+    /// </summary>
+    private static bool ShouldQuerySiteBeforeRedeploy(DeploymentRecord prior) =>
+        prior.Status == DeploymentStatus.InProgress
+        || (prior.Status == DeploymentStatus.Failed
+            && prior.ErrorMessage != null
+            && prior.ErrorMessage.StartsWith(TimeoutFailurePrefix, StringComparison.Ordinal));
+
+    /// <summary>
+    /// Post-success side effects shared by the normal deploy path and the
+    /// DeploymentManager-006 reconciliation path: set the instance
+    /// <see cref="InstanceState.Enabled"/> (WP-4) and store/refresh the
+    /// deployed config snapshot (WP-8). Factored into one helper so the two
+    /// paths cannot drift (DeploymentManager-015).
+    ///
+    /// DeploymentManager-018: <paramref name="forceEnabledState"/> distinguishes
+    /// the two callers. The normal deploy path passes <c>true</c> — a fresh
+    /// successful apply legitimately puts the instance into <see cref="InstanceState.Enabled"/>
+    /// (the documented "Deploy on a Disabled instance also enables it" semantics
+    /// of <see cref="StateTransitionValidator"/>). The reconciliation path
+    /// passes <c>false</c>: it is reconciling a *prior* deployment that may
+    /// have completed before the current operator session (central failover
+    /// loses the in-memory operation lock, so an operator may have legitimately
+    /// Disabled the instance in between). On that path we only promote
+    /// <see cref="InstanceState.NotDeployed"/> → <see cref="InstanceState.Enabled"/>
+    /// (the first-deploy-timed-out case) and leave an explicit Disabled alone,
+    /// so reconciliation never silently undoes a Disable.
+    ///
+    /// Best-effort: the deployment record's terminal <see cref="DeploymentStatus.Success"/>
+    /// status is already committed by the caller before this runs. A failure
+    /// here is logged loudly for operator reconciliation but is NOT propagated —
+    /// it must not flip the already-committed Success record back to Failed.
+    /// </summary>
+    private async Task ApplyPostSuccessSideEffectsAsync(
+        Instance instance,
+        string deploymentId,
+        string revisionHash,
+        string configJson,
+        bool forceEnabledState,
+        CancellationToken cancellationToken)
+    {
+        try
+        {
+            // WP-4: Update instance state to Enabled on successful deployment.
+            // DeploymentManager-018: on the reconciliation path
+            // (forceEnabledState=false) only promote NotDeployed → Enabled,
+            // preserving an intentional Disabled state set between the original
+            // timed-out deploy and the redeploy.
+            if (forceEnabledState || instance.State == InstanceState.NotDeployed)
+            {
+                instance.State = InstanceState.Enabled;
+            }
+            await _repository.UpdateInstanceAsync(instance, cancellationToken);
+
+            // WP-8: Store deployed config snapshot
+            await StoreDeployedSnapshotAsync(
+                instance.Id, deploymentId, revisionHash, configJson, cancellationToken);
+
+            await _repository.SaveChangesAsync(cancellationToken);
+        }
+        catch (Exception postEx)
+        {
+            _logger.LogError(postEx,
+                "Deployment {DeploymentId} for instance {Instance} was applied by the site and " +
+                "recorded Success, but post-success persistence (instance state / config snapshot) " +
+                "failed -- central and site state may diverge until reconciled",
+                deploymentId, instance.UniqueName);
+        }
+    }
+
+    /// <summary>
+    /// DeploymentManager-019: write a "&lt;Action&gt;TimedOut" audit entry on
+    /// behalf of a lifecycle command (Disable / Enable / Delete) whose site
+    /// round-trip exceeded <see cref="DeploymentManagerOptions.LifecycleCommandTimeout"/>.
+    ///
+    /// <para>
+    /// Mirrors the <c>DeployFailed</c> pattern in
+    /// <see cref="DeployInstanceAsync"/>: the audit write uses
+    /// <see cref="CancellationToken.None"/> so the operator's outer cancellation
+    /// (the usual reason this path runs) cannot also prevent the audit row from
+    /// being persisted. The detail object carries the lifecycle command id, the
+    /// timeout that fired, and the original exception message so an operator can
+    /// correlate the audit entry with the UI-surfaced timeout error.
+    /// </para>
+    ///
+    /// <para>
+    /// Wrapped in try/catch — a failed audit write must NOT mask the underlying
+    /// timeout from the caller; it is logged at Warning so the operator can
+    /// reconcile but never thrown.
+    /// </para>
+    /// </summary>
+    /// <param name="user">The username who initiated the lifecycle command.</param>
+    /// <param name="action">The audit action name (<c>DisableTimedOut</c>, <c>EnableTimedOut</c>, or <c>DeleteTimedOut</c>).</param>
+    /// <param name="instanceId">The numeric instance id, recorded on the audit row.</param>
+    /// <param name="instanceUniqueName">The instance unique name used as the audit target name.</param>
+    /// <param name="commandId">The lifecycle command's correlation id, so the audit entry can be matched to logs.</param>
+    /// <param name="timeoutException">The captured <see cref="TimeoutException"/> or <see cref="OperationCanceledException"/>.</param>
+    private async Task TryLogLifecycleTimeoutAsync(
+        string user,
+        string action,
+        int instanceId,
+        string instanceUniqueName,
+        string commandId,
+        Exception timeoutException)
+    {
+        try
+        {
+            await _auditService.LogAsync(
+                user,
+                action,
+                "Instance",
+                instanceId.ToString(),
+                instanceUniqueName,
+                new
+                {
+                    CommandId = commandId,
+                    Deadline = _options.LifecycleCommandTimeout,
+                    Error = timeoutException.Message,
+                },
+                CancellationToken.None);
+        }
+        catch (Exception auditEx)
+        {
+            // A failed audit write must not bury the timeout for the caller —
+            // just log so an operator can investigate the audit-pipeline issue.
+            _logger.LogWarning(auditEx,
+                "Failed to write {Action} audit entry for instance {Instance} (commandId={CommandId})",
+                action, instanceUniqueName, commandId);
+        }
+    }
+
+    private async Task StoreDeployedSnapshotAsync(
+        int instanceId,
+        string deploymentId,
+        string revisionHash,
+        string configJson,
+        CancellationToken cancellationToken)
+    {
+        var existing = await _repository.GetDeployedSnapshotByInstanceIdAsync(instanceId, cancellationToken);
+        if (existing != null)
+        {
+            existing.DeploymentId = deploymentId;
+            existing.RevisionHash = revisionHash;
+            existing.ConfigurationJson = configJson;
+            existing.DeployedAt = DateTimeOffset.UtcNow;
+            await _repository.UpdateDeployedSnapshotAsync(existing, cancellationToken);
+        }
+        else
+        {
+            var snapshot = new DeployedConfigSnapshot(deploymentId, revisionHash, configJson)
+            {
+                InstanceId = instanceId,
+                DeployedAt = DateTimeOffset.UtcNow
+            };
+            await _repository.AddDeployedSnapshotAsync(snapshot, cancellationToken);
+        }
+    }
+}
+
+/// <summary>
+/// WP-8: Result of comparing deployed vs template-derived configuration.
+/// </summary>
+/// <param name="Diff">
+/// DeploymentManager-007: structured added/removed/changed detail for
+/// attributes, alarms, and scripts. Null only when the deployed snapshot could
+/// not be deserialized (corrupt / older schema), in which case
+/// <see cref="IsStale"/> still reflects the hash comparison.
+/// </param>
+public record DeploymentComparisonResult(
+    int InstanceId,
+    string DeployedRevisionHash,
+    string CurrentRevisionHash,
+    bool IsStale,
+    DateTimeOffset DeployedAt,
+    ConfigurationDiff? Diff = null);