using System.Text.Json; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using ZB.MOM.WW.ScadaBridge.Commons.Entities.Deployment; using ZB.MOM.WW.ScadaBridge.Commons.Entities.Instances; using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories; using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services; using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment; using ZB.MOM.WW.ScadaBridge.Commons.Messages.Lifecycle; using ZB.MOM.WW.ScadaBridge.Commons.Observability; using ZB.MOM.WW.ScadaBridge.Commons.Types; using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.Communication; using ZB.MOM.WW.ScadaBridge.TemplateEngine.Flattening; using ZB.MOM.WW.ScadaBridge.TemplateEngine.Validation; namespace ZB.MOM.WW.ScadaBridge.DeploymentManager; /// /// WP-1: Central-side deployment orchestration service. /// Coordinates the full deployment pipeline: /// 1. Validate instance state transition (WP-4) /// 2. Acquire per-instance operation lock (WP-3) /// 3. Flatten configuration via TemplateEngine (captures template state at time of flatten -- WP-16) /// 4. Validate flattened configuration /// 5. Compute revision hash and diff /// 6. Send DeployInstanceCommand to site via CommunicationService /// 7. Track deployment status with optimistic concurrency (WP-4) /// 8. Store deployed config snapshot (WP-8) /// 9. Audit log all actions /// /// WP-2: Each deployment has a unique deployment ID (GUID) + revision hash. /// WP-16: Template state captured at flatten time -- last-write-wins on templates is safe. 
///
public class DeploymentService
{
// Injected collaborators; each is assigned exactly once by the constructor.
private readonly IDeploymentManagerRepository _repository;
private readonly ISiteRepository _siteRepository;
private readonly IFlatteningPipeline _flatteningPipeline;
private readonly CommunicationService _communicationService;
private readonly OperationLockManager _lockManager;
private readonly IAuditService _auditService;
private readonly DiffService _diffService;
private readonly RevisionHashService _revisionHashService;
private readonly IDeploymentStatusNotifier _statusNotifier;
private readonly DeploymentManagerOptions _options;
private readonly ILogger _logger;

/// <summary>
/// Prefix written to <see cref="DeploymentRecord.ErrorMessage"/> when a
/// deployment fails because the site command timed out or was cancelled.
/// Used by the query-before-redeploy trigger (DeploymentManager-006) to tell
/// a timeout-induced failure apart from other deployment errors.
/// </summary>
private const string TimeoutFailurePrefix = "Communication failure:";

/// <summary>
/// Initializes a new instance of <see cref="DeploymentService"/> with all required dependencies.
/// </summary>
/// <param name="repository">Repository for deployment manager data access.</param>
/// <param name="siteRepository">Repository for site data access.</param>
/// <param name="flatteningPipeline">Pipeline for flattening and validating template configurations.</param>
/// <param name="communicationService">Service for cross-cluster communication with sites.</param>
/// <param name="lockManager">Manager for per-instance operation locks.</param>
/// <param name="auditService">Service for recording audit log entries.</param>
/// <param name="diffService">Service for computing configuration diffs.</param>
/// <param name="revisionHashService">
/// Service for recomputing a flattened configuration's revision hash. Used by
/// <see cref="GetDeploymentComparisonAsync"/> to derive the deployed-side
/// staleness hash from the (List-normalized) deserialized snapshot — see I-1.
/// </param>
/// <param name="statusNotifier">Notifier for pushing deployment status changes to the UI.</param>
/// <param name="options">Deployment manager configuration options.</param>
/// <param name="logger">Logger instance.</param>
public DeploymentService(
    IDeploymentManagerRepository repository,
    ISiteRepository siteRepository,
    IFlatteningPipeline flatteningPipeline,
    CommunicationService communicationService,
    OperationLockManager lockManager,
    IAuditService auditService,
    DiffService diffService,
    RevisionHashService revisionHashService,
    IDeploymentStatusNotifier statusNotifier,
    IOptions<DeploymentManagerOptions> options,
    ILogger logger)
{
    _repository = repository;
    _siteRepository = siteRepository;
    _flatteningPipeline = flatteningPipeline;
    _communicationService = communicationService;
    _lockManager = lockManager;
    _auditService = auditService;
    _diffService = diffService;
    _revisionHashService = revisionHashService;
    _statusNotifier = statusNotifier;

    // Only the resolved options snapshot is kept; the IOptions wrapper is not retained.
    _options = options.Value;
    _logger = logger;
}

/// <summary>
/// CentralUI-006: raises a push notification that a deployment record's
/// status was just persisted, so the Central UI deployment-status page can
/// re-render over its SignalR circuit instead of polling. Called at every
/// point a status is written.
/// </summary>
/// <param name="record">The deployment record whose id, instance id and status are published.</param>
private void NotifyStatusChange(DeploymentRecord record) =>
    _statusNotifier.NotifyStatusChanged(
        new DeploymentStatusChange(record.DeploymentId, record.InstanceId, record.Status));

///
/// Resolves the site's string identifier from the numeric DB ID.
/// The communication layer routes by string identifier (e.g. "site-a"), not DB ID.
///
/// DeploymentManager-021: when the site row is missing (FK was
/// deleted, race with admin delete, DB inconsistency) the previous behaviour
/// silently substituted the numeric id rendered as a string — every
/// downstream `CommunicationService` call then failed with a confusing
/// "unknown site" routing error that hid the real cause. Treat a missing
/// site row as a hard validation failure: throw
/// <see cref="InvalidOperationException"/> naming the unresolved id so the
/// operator sees the actual problem.
/// On the deploy path the existing
/// try/catch turns this into a Failed deployment record with a clear
/// message; lifecycle paths propagate it to the caller (CLI/UI) which
/// surface it as an error to the operator.
///
private async Task<string> ResolveSiteIdentifierAsync(int siteId, CancellationToken cancellationToken)
{
    // A missing site row is a hard failure; the numeric id is never substituted
    // for the routing identifier.
    var site = await _siteRepository.GetSiteByIdAsync(siteId, cancellationToken)
        ?? throw new InvalidOperationException(
            $"Site with ID {siteId} not found; cannot resolve its SiteIdentifier for routing.");

    return site.SiteIdentifier;
}

/// <summary>
/// WP-1: Deploy an instance to its site.
/// WP-2: Generates unique deployment ID, computes revision hash.
/// WP-4: Validates state transitions, uses optimistic concurrency.
/// WP-5: Site-side apply is all-or-nothing (handled by DeploymentManagerActor).
/// WP-8: Stores deployed config snapshot on success.
/// WP-16: Captures template state at time of flatten.
/// </summary>
/// <param name="instanceId">The database ID of the instance to deploy.</param>
/// <param name="user">The username initiating the deployment, recorded in the audit log.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
/// <returns>A task that resolves to a success result containing the deployment record, or a failure result with an error message.</returns>
public async Task<Result<DeploymentRecord>> DeployInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    // Load instance
    var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (instance == null)
        return Result<DeploymentRecord>.Failure($"Instance with ID {instanceId} not found.");

    // WP-4: Validate state transition
    var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "deploy");
    if (transitionError != null)
        return Result<DeploymentRecord>.Failure(transitionError);

    // WP-3: Acquire per-instance operation lock
    using var lockHandle = await _lockManager.AcquireAsync(
        instance.UniqueName, _options.OperationLockTimeout, cancellationToken);

    // WP-2: Generate unique deployment ID
    var deploymentId = Guid.NewGuid().ToString("N");

    // WP-1/16: Flatten configuration (captures template state at this point in time)
    var flattenResult = await _flatteningPipeline.FlattenAndValidateAsync(instanceId, cancellationToken);
    if (flattenResult.IsFailure)
        return Result<DeploymentRecord>.Failure($"Validation failed: {flattenResult.Error}");

    var flattenedConfig = flattenResult.Value.Configuration;
    var revisionHash = flattenResult.Value.RevisionHash;
    var validationResult = flattenResult.Value.Validation;

    if (!validationResult.IsValid)
    {
        // Followup #8: return a grouped/summarized error (leading count + per-module
        // rollup, capped) instead of a flat semicolon-joined dump that becomes a wall
        // of text for instances with dozens of unbound attributes. The full per-entry
        // list still goes to the deploy log for operators who need every clause.
        _logger.LogWarning(
            "Pre-deployment validation failed for instance {InstanceId} ({ErrorCount} error(s)): {Detail}",
            instanceId,
            validationResult.Errors.Count,
            string.Join("; ", validationResult.Errors.Select(e => e.Message)));

        return Result<DeploymentRecord>.Failure(
            $"Pre-deployment validation failed: {validationResult.SummarizeErrors()}");
    }

    // Serialize for transmission (also the payload stored in the deployed
    // snapshot on success / reconciliation).
    var configJson = JsonSerializer.Serialize(flattenedConfig);

    // DeploymentManager-006: query-the-site-before-redeploy idempotency.
    // If a prior deployment for this instance is stuck InProgress or Failed
    // due to a timeout, the site may have actually applied the config. Query
    // the site for its currently-applied revision before re-sending so a
    // duplicate deployment is not produced (design: "Deployment Identity &
    // Idempotency"). A clean prior Success or a fresh first-time deploy
    // skips this extra round-trip.
    var reconciled = await TryReconcileWithSiteAsync(
        instance, revisionHash, configJson, user, cancellationToken);
    if (reconciled != null)
        return Result<DeploymentRecord>.Success(reconciled);

    // WP-4: Create the deployment record directly in InProgress.
    //
    // DeploymentManager-022: the previous code wrote the record as Pending,
    // then immediately updated it to InProgress with no work in between
    // (flattening, validation, and reconciliation all completed above). The
    // back-to-back write cost an extra SaveChangesAsync round-trip, an
    // extra IDeploymentStatusNotifier push (CentralUI-006 rendered a
    // Pending→InProgress flicker for ~ms), and an extra row-version bump
    // for nothing. The transient Pending slot carried no operational
    // meaning — it was set and immediately overwritten — so dropping it
    // collapses the start of the deploy into a single insert + notify.
    // InProgress remains the documented "sent to site, awaiting response"
    // state, set immediately before the round-trip below.
    var record = new DeploymentRecord(deploymentId, user)
    {
        InstanceId = instanceId,
        Status = DeploymentStatus.InProgress,
        RevisionHash = revisionHash,
        DeployedAt = DateTimeOffset.UtcNow
    };

    await _repository.AddDeploymentRecordAsync(record, cancellationToken);
    await _repository.SaveChangesAsync(cancellationToken);
    NotifyStatusChange(record);

    try
    {
        // WP-1: Send to site via CommunicationService
        var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
        var command = new DeployInstanceCommand(
            deploymentId, instance.UniqueName, revisionHash, configJson, user, DateTimeOffset.UtcNow);

        _logger.LogInformation(
            "Sending deployment {DeploymentId} for instance {Instance} to site {SiteId}",
            deploymentId, instance.UniqueName, siteId);

        var response = await _communicationService.DeployInstanceAsync(siteId, command, cancellationToken);

        // WP-1: Update status based on site response.
        record.Status = response.Status;
        record.ErrorMessage = response.ErrorMessage;
        record.CompletedAt = DateTimeOffset.UtcNow;

        // DeploymentManager-003: once the site has confirmed the apply,
        // commit the deployment record's terminal status BEFORE touching
        // instance state and the deployed-config snapshot. If a later write
        // (instance update / snapshot store) fails, the recorded fact that
        // the site succeeded must NOT be lost -- otherwise central reports a
        // non-Success record while the site is running the new config.
        await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
        await _repository.SaveChangesAsync(cancellationToken);
        NotifyStatusChange(record);

        if (response.Status == DeploymentStatus.Success)
        {
            // Telemetry: one instance deployment successfully applied to a
            // site. Counted once per successful deploy operation (the unit
            // of scadabridge.deployments.applied — one DeployInstanceAsync
            // deploys exactly one instance to one site). Emitted only on this
            // confirmed-Success path, so failures, timeouts/retries (the
            // catch block), and the reconciliation path (which recovers a
            // PRIOR timed-out apply rather than performing a fresh one) do
            // not increment it.
            ScadaBridgeTelemetry.RecordDeploymentApplied();

            // The site has applied the deployment. The post-success
            // persistence below is best-effort: a failure here must be
            // logged loudly for operator reconciliation but must not flip
            // the already-committed Success record back to Failed.
            await ApplyPostSuccessSideEffectsAsync(
                instance, deploymentId, revisionHash, configJson,
                forceEnabledState: true, cancellationToken);
        }

        // Audit log
        await _auditService.LogAsync(user, "Deploy", "Instance", instanceId.ToString(),
            instance.UniqueName,
            new { DeploymentId = deploymentId, Status = record.Status.ToString() },
            cancellationToken);

        _logger.LogInformation(
            "Deployment {DeploymentId} for instance {Instance}: {Status}",
            deploymentId, instance.UniqueName, record.Status);

        return record.Status == DeploymentStatus.Success
            ? Result<DeploymentRecord>.Success(record)
            : Result<DeploymentRecord>.Failure(
                $"Deployment failed: {response.ErrorMessage ?? "Unknown error"}");
    }
    catch (Exception ex)
    {
        // DeploymentManager-001: any exception out of the try (timeout,
        // cancellation, transport, serialization, DB) must leave the
        // deployment record as Failed -- the design requires an interrupted
        // deployment to be treated as failed, never stuck in InProgress.
        //
        // DeploymentManager-002: the failure-status write must NOT use the
        // operation's cancellation token. If the operation was cancelled or
        // timed out, that token is already cancelled and the cleanup writes
        // would themselves throw before the Failed status is persisted.
        // Use CancellationToken.None so the failure is durably recorded.
        var isTimeout = ex is TimeoutException or OperationCanceledException;

        record.Status = DeploymentStatus.Failed;
        record.ErrorMessage = isTimeout
            ? $"{TimeoutFailurePrefix} {ex.Message}"
            : $"Deployment error: {ex.Message}";
        record.CompletedAt = DateTimeOffset.UtcNow;

        try
        {
            await _repository.UpdateDeploymentRecordAsync(record, CancellationToken.None);
            await _repository.SaveChangesAsync(CancellationToken.None);
            NotifyStatusChange(record);

            await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
                instance.UniqueName,
                new { DeploymentId = deploymentId, Error = ex.Message },
                CancellationToken.None);
        }
        catch (Exception cleanupEx)
        {
            // The deployment already failed; a failed cleanup write must not
            // mask the original error. Log loudly so an operator can reconcile.
            _logger.LogError(cleanupEx,
                "Failed to persist Failed status for deployment {DeploymentId} of instance {Instance} " +
                "after deployment error: {Error}",
                deploymentId, instance.UniqueName, ex.Message);
        }

        _logger.LogError(ex,
            "Deployment {DeploymentId} for instance {Instance} failed",
            deploymentId, instance.UniqueName);

        return Result<DeploymentRecord>.Failure(
            isTimeout
                ? $"Deployment timed out: {ex.Message}"
                : $"Deployment failed: {ex.Message}");
    }
}

/// <summary>
/// WP-6: Disable an instance. Stops Instance Actor, retains config, S&amp;F drains.
/// </summary>
/// <param name="instanceId">The database ID of the instance to disable.</param>
/// <param name="user">The username initiating the operation, recorded in the audit log.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
/// <returns>A task that resolves to a success result with the site response, or a failure result with an error message.</returns>
public async Task<Result<InstanceLifecycleResponse>> DisableInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (instance == null)
        return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");

    var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "disable");
    if (transitionError != null)
        return Result<InstanceLifecycleResponse>.Failure(transitionError);

    using var lockHandle = await _lockManager.AcquireAsync(
        instance.UniqueName, _options.OperationLockTimeout, cancellationToken);

    var commandId = Guid.NewGuid().ToString("N");
    var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
    var command = new DisableInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);

    // WP-6: bound the round-trip with the configured lifecycle timeout so a
    // hung/unreachable site does not block the operation lock indefinitely.
    InstanceLifecycleResponse response;
    try
    {
        using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        cts.CancelAfter(_options.LifecycleCommandTimeout);
        response = await _communicationService.DisableInstanceAsync(siteId, command, cts.Token);
    }
    catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
    {
        // DeploymentManager-019: a lifecycle command timeout produced no
        // audit row pre-fix — the operator saw a timeout in the UI but
        // the audit trail showed nothing happened, contrary to the
        // design's "audit logging for all instance lifecycle changes"
        // rule. Mirror the DeployFailed pattern: write a "TimedOut"
        // entry with CancellationToken.None so a cancelled outer token
        // (the typical reason this catch ran) cannot prevent the
        // durable audit write.
        await TryLogLifecycleTimeoutAsync(
            user, "DisableTimedOut", instanceId, instance.UniqueName, commandId, ex);

        _logger.LogWarning(ex, "Disable of instance {Instance} timed out", instance.UniqueName);
        return Result<InstanceLifecycleResponse>.Failure(
            $"Disable failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
    }

    // Central state only moves to Disabled once the site reports success.
    if (response.Success)
    {
        instance.State = InstanceState.Disabled;
        await _repository.UpdateInstanceAsync(instance, cancellationToken);
        await _repository.SaveChangesAsync(cancellationToken);
    }

    // The attempt is audited whether or not the site accepted it.
    await _auditService.LogAsync(user, "Disable", "Instance", instanceId.ToString(),
        instance.UniqueName,
        new { CommandId = commandId, response.Success },
        cancellationToken);

    return response.Success
        ? Result<InstanceLifecycleResponse>.Success(response)
        : Result<InstanceLifecycleResponse>.Failure(response.ErrorMessage ?? "Disable failed.");
}

/// <summary>
/// WP-6: Enable an instance. Re-creates Instance Actor from stored config.
/// </summary>
/// <param name="instanceId">The database ID of the instance to enable.</param>
/// <param name="user">The username initiating the operation, recorded in the audit log.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
/// <returns>A task that resolves to a success result with the site response, or a failure result with an error message.</returns>
public async Task<Result<InstanceLifecycleResponse>> EnableInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (instance == null)
        return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");

    var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "enable");
    if (transitionError != null)
        return Result<InstanceLifecycleResponse>.Failure(transitionError);

    using var lockHandle = await _lockManager.AcquireAsync(
        instance.UniqueName, _options.OperationLockTimeout, cancellationToken);

    var commandId = Guid.NewGuid().ToString("N");
    var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
    var command = new EnableInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);

    // WP-6: bound the round-trip with the configured lifecycle timeout.
    InstanceLifecycleResponse response;
    try
    {
        using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        cts.CancelAfter(_options.LifecycleCommandTimeout);
        response = await _communicationService.EnableInstanceAsync(siteId, command, cts.Token);
    }
    catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
    {
        // DeploymentManager-019: emit an audit entry on lifecycle timeout
        // so the operator's attempted Enable is recorded; see the matching
        // comment in DisableInstanceAsync for the full rationale.
        await TryLogLifecycleTimeoutAsync(
            user, "EnableTimedOut", instanceId, instance.UniqueName, commandId, ex);

        _logger.LogWarning(ex, "Enable of instance {Instance} timed out", instance.UniqueName);
        return Result<InstanceLifecycleResponse>.Failure(
            $"Enable failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
    }

    // Central state only moves to Enabled once the site reports success.
    if (response.Success)
    {
        instance.State = InstanceState.Enabled;
        await _repository.UpdateInstanceAsync(instance, cancellationToken);
        await _repository.SaveChangesAsync(cancellationToken);
    }

    // The attempt is audited whether or not the site accepted it.
    await _auditService.LogAsync(user, "Enable", "Instance", instanceId.ToString(),
        instance.UniqueName,
        new { CommandId = commandId, response.Success },
        cancellationToken);

    return response.Success
        ? Result<InstanceLifecycleResponse>.Success(response)
        : Result<InstanceLifecycleResponse>.Failure(response.ErrorMessage ?? "Enable failed.");
}

/// <summary>
/// WP-6: Delete an instance. Stops the site actor, removes site config, and
/// removes the central instance record (deployment history, snapshot,
/// overrides, and connection bindings go with it). S&amp;F NOT cleared.
/// Delete fails if the site is unreachable within
/// <c>CommunicationOptions.LifecycleTimeout</c> (applied inside the
/// communication layer — NOTE(review): not visible from this file). The
/// method itself additionally bounds the site round-trip with
/// <c>DeploymentManagerOptions.LifecycleCommandTimeout</c>.
/// </summary>
/// <param name="instanceId">The database ID of the instance to delete.</param>
/// <param name="user">The username initiating the deletion, recorded in the audit log.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
/// <returns>A task that resolves to a success result with the site response, or a failure result with an error message.</returns>
public async Task<Result<InstanceLifecycleResponse>> DeleteInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (instance == null)
        return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");

    var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "delete");
    if (transitionError != null)
        return Result<InstanceLifecycleResponse>.Failure(transitionError);

    using var lockHandle = await _lockManager.AcquireAsync(
        instance.UniqueName, _options.OperationLockTimeout, cancellationToken);

    var commandId = Guid.NewGuid().ToString("N");
    var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
    var command = new DeleteInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);

    // WP-6: bound the round-trip with the configured lifecycle timeout.
    InstanceLifecycleResponse response;
    try
    {
        using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        cts.CancelAfter(_options.LifecycleCommandTimeout);
        response = await _communicationService.DeleteInstanceAsync(siteId, command, cts.Token);
    }
    catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
    {
        // DeploymentManager-019: emit an audit entry on lifecycle timeout
        // so the operator's attempted Delete is recorded; see the matching
        // comment in DisableInstanceAsync for the full rationale.
        await TryLogLifecycleTimeoutAsync(
            user, "DeleteTimedOut", instanceId, instance.UniqueName, commandId, ex);

        _logger.LogWarning(ex, "Delete of instance {Instance} timed out", instance.UniqueName);
        return Result<InstanceLifecycleResponse>.Failure(
            $"Delete failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
    }

    if (response.Success)
    {
        // Delete means delete: remove the instance record entirely.
        // Deployment records, snapshot, overrides, and connection bindings
        // are removed with it (see repository implementation).
        //
        // DeploymentManager-004: the site has already destroyed the Instance
        // Actor and removed its config. If the central record removal now
        // fails (DB error / concurrency), the exception must NOT escape
        // uncaught -- that would leave the central record orphaned and
        // un-deletable through the normal path (a re-issued delete may fail
        // because the site no longer has the instance). Surface a distinct
        // failure so an operator can reconcile.
        try
        {
            await _repository.DeleteInstanceAsync(instanceId, cancellationToken);
            await _repository.SaveChangesAsync(cancellationToken);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex,
                "Instance {Instance} was deleted at the site, but the central record could not be " +
                "removed -- the central record is now orphaned and must be reconciled manually",
                instance.UniqueName);

            // The orphan condition is the result the caller must receive. A failing
            // audit write is logged instead of being allowed to escape and replace
            // that result with an unhandled exception.
            try
            {
                await _auditService.LogAsync(user, "DeleteOrphaned", "Instance", instanceId.ToString(),
                    instance.UniqueName,
                    new { CommandId = commandId, Error = ex.Message },
                    CancellationToken.None);
            }
            catch (Exception auditEx)
            {
                _logger.LogError(auditEx,
                    "Failed to write DeleteOrphaned audit entry for instance {Instance} (command {CommandId})",
                    instance.UniqueName, commandId);
            }

            return Result<InstanceLifecycleResponse>.Failure(
                $"The site deleted instance '{instance.UniqueName}', but the central record could not " +
                $"be removed: {ex.Message}. The central record is orphaned and must be reconciled.");
        }
    }

    // The attempt is audited whether or not the site accepted it.
    await _auditService.LogAsync(user, "Delete", "Instance", instanceId.ToString(),
        instance.UniqueName,
        new { CommandId = commandId, response.Success },
        cancellationToken);

    return response.Success
        ? Result<InstanceLifecycleResponse>.Success(response)
        : Result<InstanceLifecycleResponse>.Failure(
            response.ErrorMessage ?? "Delete failed. Site may be unreachable.");
}

/// <summary>
/// WP-8: Get the deployed config snapshot and compare with current
/// template-derived state. Produces both a staleness flag and — per the
/// design's "Diff View" — a structured <see cref="ConfigurationDiff"/> of
/// added/removed/changed attributes, alarms, and scripts (including data
/// connection binding changes) computed by the TemplateEngine
/// <see cref="DiffService"/>.
/// </summary>
/// <param name="instanceId">The database ID of the instance to compare.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
/// <returns>A task that resolves to a success result with the comparison, or a failure result if no snapshot exists.</returns>
public async Task<Result<DeploymentComparisonResult>> GetDeploymentComparisonAsync(
    int instanceId,
    CancellationToken cancellationToken = default)
{
    var snapshot = await _repository.GetDeployedSnapshotByInstanceIdAsync(instanceId, cancellationToken);
    if (snapshot == null)
        return Result<DeploymentComparisonResult>.Failure("No deployed snapshot found for this instance.");

    // Compute current template-derived config
    var currentResult = await _flatteningPipeline.FlattenAndValidateAsync(instanceId, cancellationToken);
    if (currentResult.IsFailure)
        return Result<DeploymentComparisonResult>.Failure($"Cannot compute current config: {currentResult.Error}");

    var currentConfig = currentResult.Value.Configuration;
    var currentHash = currentResult.Value.RevisionHash;

    // I-1 (latent): the snapshot's ConfigurationJson + RevisionHash froze the
    // FLATTENED config at deploy time. The current config is a FRESH flatten,
    // now always in native List form (#93 consolidated element-type/coercion
    // into AttributeValueCodec, which emits native-form JSON arrays). A List
    // attribute deployed in the OLD quoted form (e.g. ["10","20"]) therefore
    // both (a) hashes differently from the native re-flatten — a spurious
    // stale flag — and (b) shows a spurious Changed attribute in the diff
    // (DiffService.AttributesEqual is an ordinal Value comparison). Normalize
    // the deserialized snapshot's List values through AttributeValueCodec
    // Decode→Encode so an old-form value becomes native form and compares
    // equal to the native re-flatten, then drive BOTH the staleness hash and
    // the diff off that normalized snapshot. Scalars are left untouched.
    //
    // DeploymentManager-007: a snapshot that cannot be deserialized (corrupt /
    // older schema) still yields the frozen-hash staleness result, with a
    // null diff.
    var deployedRevisionHash = snapshot.RevisionHash;
    ConfigurationDiff? diff = null;
    try
    {
        var deployedConfig = JsonSerializer.Deserialize<FlattenedConfiguration>(snapshot.ConfigurationJson);
        if (deployedConfig != null)
        {
            deployedConfig = NormalizeListAttributeValues(deployedConfig);

            // Recompute the deployed-side hash from the normalized snapshot so
            // an old-form List value is not flagged stale against the native
            // re-flatten. For a faithfully-stored scalar-only snapshot this
            // reproduces the frozen RevisionHash exactly, so behaviour is
            // unchanged outside the List-normalization case.
            deployedRevisionHash = _revisionHashService.ComputeHash(deployedConfig);

            diff = _diffService.ComputeDiff(
                deployedConfig, currentConfig, deployedRevisionHash, currentHash);
        }
        else
        {
            _logger.LogWarning(
                "Deployed snapshot for instance {InstanceId} deserialized to null; " +
                "returning hash-based comparison without a structured diff",
                instanceId);
        }
    }
    catch (JsonException ex)
    {
        _logger.LogWarning(ex,
            "Could not deserialize deployed snapshot for instance {InstanceId}; " +
            "returning hash-based comparison without a structured diff",
            instanceId);
    }

    // Stale when the (possibly normalized) deployed hash differs from the fresh flatten's hash.
    var isStale = deployedRevisionHash != currentHash;

    var result = new DeploymentComparisonResult(
        instanceId,
        deployedRevisionHash,
        currentHash,
        isStale,
        snapshot.DeployedAt,
        diff);

    return Result<DeploymentComparisonResult>.Success(result);
}

///
/// I-1 (latent): returns a copy of the given attribute whose List
/// value has been round-tripped through AttributeValueCodec Decode→Encode
/// (native JSON-array form). This normalizes a value deployed in the OLD quoted
/// form (e.g. ["10","20"]) to the native form ([10,20]) the current
/// flattener now produces, so the staleness hash and the structured diff do not
/// report a spurious change. Scalar / string attributes are returned unchanged
/// (only DataType.List is normalized). A value that cannot be
/// decoded (malformed JSON, bad element, or an unparseable element type) is left
/// as-is — a normalization failure must never break the read-only comparison.
///
private ResolvedAttribute NormalizeListAttribute(ResolvedAttribute attr)
{
    // Only non-empty List-typed values are candidates for normalization.
    if (!string.Equals(attr.DataType, nameof(DataType.List), StringComparison.OrdinalIgnoreCase)
        || string.IsNullOrEmpty(attr.Value))
    {
        return attr;
    }

    // An element type that is missing, unparseable, or not valid for a List leaves the value alone.
    if (!Enum.TryParse<DataType>(attr.ElementDataType, ignoreCase: true, out var elementType)
        || !AttributeValueCodec.IsValidElementType(elementType))
    {
        return attr;
    }

    try
    {
        var normalized = AttributeValueCodec.Encode(
            AttributeValueCodec.Decode(attr.Value, DataType.List, elementType));

        // Hand back the same instance when nothing changed, so the caller can
        // detect "no change" by reference.
        return normalized == attr.Value ? attr : attr with { Value = normalized };
    }
    catch (FormatException ex)
    {
        // Best-effort: a snapshot value that no longer round-trips is left
        // untouched rather than aborting the comparison. Logged so an operator
        // can investigate the stored value.
        _logger.LogWarning(ex,
            "Could not normalize List attribute '{Attribute}' in deployed snapshot; " +
            "comparing its stored value verbatim",
            attr.CanonicalName);
        return attr;
    }
}

/// <summary>
/// I-1 (latent): applies <see cref="NormalizeListAttribute"/> to every attribute
/// in <paramref name="config"/>, returning the original instance unchanged when
/// no List value needed normalizing (the common scalar-only case).
/// </summary>
/// <param name="config">The deserialized deployed configuration to normalize.</param>
/// <returns>
/// <paramref name="config"/> itself when no attribute changed; otherwise a copy
/// whose <c>Attributes</c> holds the normalized list.
/// </returns>
private FlattenedConfiguration NormalizeListAttributeValues(FlattenedConfiguration config)
{
    if (config.Attributes.Count == 0)
        return config;

    var normalized = config.Attributes.Select(NormalizeListAttribute).ToList();

    // NormalizeListAttribute returns the same reference for untouched attributes,
    // so any reference mismatch means at least one value was rewritten.
    var changed = normalized.Where((a, i) => !ReferenceEquals(a, config.Attributes[i])).Any();

    return changed ? config with { Attributes = normalized } : config;
}

/// <summary>
/// WP-2: Returns the current persisted <see cref="DeploymentRecord"/> for
/// the given deployment ID from the configuration database. This is a pure
/// local DB read — it does not contact the site. The query-the-site-before-
/// redeploy reconciliation (design: "Deployment Identity &amp; Idempotency")
/// lives in <see cref="TryReconcileWithSiteAsync"/>, which
/// <see cref="DeployInstanceAsync"/> invokes on the deploy path.
/// </summary>
/// <param name="deploymentId">The unique deployment identifier to look up.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
/// <returns>A task that resolves to the matching deployment record, or null if none exists.</returns>
public async Task<DeploymentRecord?> GetDeploymentStatusAsync(
    string deploymentId,
    CancellationToken cancellationToken = default)
{
    // Straight repository lookup; no site round-trip is made here.
    return await _repository.GetDeploymentByDeploymentIdAsync(deploymentId, cancellationToken);
}

///
/// DeploymentManager-006: query-the-site-before-redeploy reconciliation.
///
/// The site query is issued ONLY when a prior DeploymentRecord
/// for this instance is stuck InProgress, or
/// is Failed due to a timeout — the only
/// cases where the site may have applied the config without central
/// learning of it. Fresh first-time deploys and redeploys after a clean
/// prior Success skip the extra round-trip.
///
/// Reconciliation: if the site already has the TARGET revision hash, the
/// prior record is marked Success (with its
/// RevisionHash corrected to the target —
/// DeploymentManager-016) and returned (the caller must NOT re-send the
/// deploy). The same post-success side effects as the normal deploy path
/// are applied — instance State and a stored
/// deployed config snapshot (DeploymentManager-015) — so central
/// and site state do not diverge. Otherwise null is returned and the
/// normal deploy proceeds.
///
/// Query failure: if the site is unreachable or the query times out, this
/// returns null (fall through to a normal deploy) — site-side
/// stale-rejection of an older revision hash is the safety net. The deploy
/// is never aborted on a failed query.
/// <param name="instance">Instance being (re)deployed; its <c>Id</c>, <c>SiteId</c> and <c>UniqueName</c> are read.</param>
/// <param name="targetRevisionHash">Revision hash of the configuration about to be deployed.</param>
/// <param name="configJson">Configuration JSON stored as the deployed snapshot when reconciliation succeeds.</param>
/// <param name="currentUser">User driving this redeploy; recorded as the actor of the <c>DeployReconciled</c> audit entry.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
/// <returns>
/// The reconciled prior record when the site already runs the target revision;
/// otherwise <c>null</c>, meaning the caller should proceed with a normal deploy.
/// </returns>
private async Task<DeploymentRecord?> TryReconcileWithSiteAsync(
    Instance instance,
    string targetRevisionHash,
    string configJson,
    string currentUser,
    CancellationToken cancellationToken)
{
    var prior = await _repository.GetCurrentDeploymentStatusAsync(instance.Id, cancellationToken);
    if (prior == null || !ShouldQuerySiteBeforeRedeploy(prior))
    {
        return null;
    }

    DeploymentStateQueryResponse response;
    try
    {
        var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
        var query = new DeploymentStateQueryRequest(
            Guid.NewGuid().ToString("N"),
            instance.UniqueName,
            DateTimeOffset.UtcNow);

        _logger.LogInformation(
            "Querying site {SiteId} for applied deployment state of instance {Instance} " +
            "before re-deploy (prior record {DeploymentId} is {Status})",
            siteId, instance.UniqueName, prior.DeploymentId, prior.Status);

        response = await _communicationService.QueryDeploymentStateAsync(
            siteId, query, cancellationToken);
    }
    catch (Exception ex)
    {
        // Query failure (site unreachable / timeout): do NOT abort. Fall
        // through to a normal deploy; site-side stale-rejection of an older
        // revision hash is the safety net.
        _logger.LogWarning(ex,
            "Site query before re-deploy of instance {Instance} failed; " +
            "proceeding with normal deploy (site-side stale-rejection is the safety net)",
            instance.UniqueName);
        return null;
    }

    var siteAlreadyAtTarget =
        response.IsDeployed
        && string.Equals(response.AppliedRevisionHash, targetRevisionHash, StringComparison.Ordinal);
    if (!siteAlreadyAtTarget)
    {
        // Site does not have the target revision (or is not deployed) — proceed
        // with the normal deploy.
        return null;
    }

    // The site already has the target revision — the prior deployment
    // actually succeeded. Reconcile the stale record instead of
    // re-sending the deploy.
    _logger.LogInformation(
        "Site already has target revision {RevisionHash} for instance {Instance}; " +
        "marking prior deployment record {DeploymentId} Success without re-deploying",
        targetRevisionHash, instance.UniqueName, prior.DeploymentId);

    prior.Status = DeploymentStatus.Success;
    prior.ErrorMessage = null;
    prior.CompletedAt = DateTimeOffset.UtcNow;

    // DeploymentManager-016: the prior record can legitimately carry a
    // different (stale) revision hash than the current target. The site
    // confirmed it is running the target revision, so the persisted
    // record, the audit entry below, and the site must all agree.
    prior.RevisionHash = targetRevisionHash;

    await _repository.UpdateDeploymentRecordAsync(prior, cancellationToken);
    await _repository.SaveChangesAsync(cancellationToken);
    NotifyStatusChange(prior);

    // DeploymentManager-015: a reconciled deployment must perform the
    // SAME post-success side effects as the normal deploy path — set
    // the instance State to Enabled and store/refresh the deployed
    // config snapshot — otherwise the central state machine and the
    // deployed-snapshot invariant diverge from what the site is running.
    //
    // DeploymentManager-018: the reconciliation path runs only when the
    // prior record is InProgress or timeout-Failed — exactly the cases
    // that survive a central failover. The in-memory operation lock is
    // lost on failover, so an operator may have legitimately invoked
    // Disable on the instance between the original timed-out deploy and
    // this redeploy. Disable does not change the deployed config, so the
    // site still reports the target revision hash. Reconciliation must
    // therefore PRESERVE an intentional Disabled state instead of
    // silently flipping it back to Enabled — pass forceEnabledState:
    // false so the helper only promotes NotDeployed → Enabled (the
    // first-deploy-timed-out case) and leaves an explicit Disabled
    // alone.
    await ApplyPostSuccessSideEffectsAsync(
        instance,
        prior.DeploymentId,
        targetRevisionHash,
        configJson,
        forceEnabledState: false,
        cancellationToken);

    // DeploymentManager-020: attribute the audit row to the user driving
    // THIS redeploy (the caller of DeployInstanceAsync), not the user
    // who issued the original timed-out / stuck deployment. The original
    // deployer is preserved in the detail object so forensics can still
    // see who launched the run that reconciliation rescued.
    await _auditService.LogAsync(
        currentUser,
        "DeployReconciled",
        "Instance",
        instance.Id.ToString(),
        instance.UniqueName,
        new
        {
            DeploymentId = prior.DeploymentId,
            RevisionHash = targetRevisionHash,
            OriginalDeployer = prior.DeployedBy
        },
        cancellationToken);

    return prior;
}

/// <summary>
/// DeploymentManager-006: the site is queried before a re-deploy only when a
/// prior record is stuck <see cref="DeploymentStatus.InProgress"/>, or is
/// <see cref="DeploymentStatus.Failed"/> because the site command timed out
/// (detected via the <see cref="TimeoutFailurePrefix"/> error-message
/// marker). All other prior states skip the query.
/// </summary>
/// <param name="prior">The most recent deployment record for the instance.</param>
/// <returns><c>true</c> when the site should be queried before re-deploying.</returns>
private static bool ShouldQuerySiteBeforeRedeploy(DeploymentRecord prior) =>
    prior.Status == DeploymentStatus.InProgress
    || (prior.Status == DeploymentStatus.Failed
        && prior.ErrorMessage != null
        && prior.ErrorMessage.StartsWith(TimeoutFailurePrefix, StringComparison.Ordinal));

/// <summary>
/// Post-success side effects shared by the normal deploy path and the
/// DeploymentManager-006 reconciliation path: set the instance
/// <see cref="Instance.State"/> (WP-4) and store/refresh the
/// deployed config snapshot (WP-8). Factored into one helper so the two
/// paths cannot drift (DeploymentManager-015).
/// </summary>
///
/// DeploymentManager-018: <paramref name="forceEnabledState"/> distinguishes
/// the two callers. The normal deploy path passes <c>true</c> — a fresh
/// successful apply legitimately puts the instance into
/// <see cref="InstanceState.Enabled"/>
/// (the documented "Deploy on a Disabled instance also enables it" semantics
/// of the deploy operation).
/// The reconciliation path passes <c>false</c>: it is reconciling a *prior*
/// deployment that may have completed before the current operator session
/// (central failover loses the in-memory operation lock, so an operator may
/// have legitimately Disabled the instance in between). On that path we only
/// promote <see cref="InstanceState.NotDeployed"/> → <see cref="InstanceState.Enabled"/>
/// (the first-deploy-timed-out case) and leave an explicit Disabled alone,
/// so reconciliation never silently undoes a Disable.
///
/// Best-effort: the deployment record's terminal
/// status is already committed by the caller before this runs. A failure
/// here is logged loudly for operator reconciliation but is NOT propagated —
/// it must not flip the already-committed Success record back to Failed.
///
/// <param name="instance">Instance whose state is updated and persisted.</param>
/// <param name="deploymentId">Deployment ID recorded on the stored snapshot.</param>
/// <param name="revisionHash">Revision hash recorded on the stored snapshot.</param>
/// <param name="configJson">Configuration JSON recorded on the stored snapshot.</param>
/// <param name="forceEnabledState">
/// <c>true</c> to set the instance to Enabled unconditionally; <c>false</c> to
/// promote only a NotDeployed instance.
/// </param>
/// <param name="cancellationToken">Cancellation token for the repository calls.</param>
private async Task ApplyPostSuccessSideEffectsAsync(
    Instance instance,
    string deploymentId,
    string revisionHash,
    string configJson,
    bool forceEnabledState,
    CancellationToken cancellationToken)
{
    try
    {
        // WP-4 / DeploymentManager-018: a normal deploy always enables the
        // instance; reconciliation (forceEnabledState=false) only lifts
        // NotDeployed → Enabled so a deliberate Disable made between the
        // timed-out deploy and the redeploy survives.
        var promoteToEnabled = forceEnabledState || instance.State == InstanceState.NotDeployed;
        if (promoteToEnabled)
        {
            instance.State = InstanceState.Enabled;
        }

        // The instance is written back even when its state was left as-is.
        await _repository.UpdateInstanceAsync(instance, cancellationToken);

        // WP-8: persist (or refresh) the deployed config snapshot.
        await StoreDeployedSnapshotAsync(
            instance.Id,
            deploymentId,
            revisionHash,
            configJson,
            cancellationToken);

        await _repository.SaveChangesAsync(cancellationToken);
    }
    catch (Exception ex)
    {
        // Swallowed on purpose: the Success record is already committed, so
        // this failure is surfaced through the log only.
        _logger.LogError(ex,
            "Deployment {DeploymentId} for instance {Instance} was applied by the site and " +
            "recorded Success, but post-success persistence (instance state / config snapshot) " +
            "failed -- central and site state may diverge until reconciled",
            deploymentId, instance.UniqueName);
    }
}

/// <summary>
/// DeploymentManager-019: write a "&lt;Action&gt;TimedOut" audit entry on
/// behalf of a lifecycle command (Disable / Enable / Delete) whose site
/// round-trip exceeded <see cref="DeploymentManagerOptions.LifecycleCommandTimeout"/>.
/// </summary>
/// <remarks>
/// <para>
/// Mirrors the DeployFailed pattern in the deploy path: the audit write uses
/// <see cref="CancellationToken.None"/> so the operator's outer cancellation
/// (the usual reason this path runs) cannot also prevent the audit row from
/// being persisted. The detail object carries the lifecycle command id, the
/// timeout that fired, and the original exception message so an operator can
/// correlate the audit entry with the UI-surfaced timeout error.
/// </para>
/// <para>
/// Wrapped in try/catch — a failed audit write must NOT mask the underlying
/// timeout from the caller; it is logged at Warning so the operator can
/// reconcile but never thrown.
/// </para>
/// </remarks>
/// <param name="user">The username who initiated the lifecycle command.</param>
/// <param name="action">The audit action name (DisableTimedOut, EnableTimedOut, or DeleteTimedOut).</param>
/// <param name="instanceId">The numeric instance id, recorded on the audit row.</param>
/// <param name="instanceUniqueName">The instance unique name used as the audit target name.</param>
/// <param name="commandId">The lifecycle command's correlation id, so the audit entry can be matched to logs.</param>
/// <param name="timeoutException">The captured timeout or cancellation exception.</param>
private async Task TryLogLifecycleTimeoutAsync(
    string user,
    string action,
    int instanceId,
    string instanceUniqueName,
    string commandId,
    Exception timeoutException)
{
    try
    {
        // Built inside the try so that any failure while assembling the
        // detail is handled the same way as a failed audit write.
        var detail = new
        {
            CommandId = commandId,
            Deadline = _options.LifecycleCommandTimeout,
            Error = timeoutException.Message,
        };

        // CancellationToken.None: the audit row must be written even when the
        // caller's own token has already been cancelled.
        await _auditService.LogAsync(
            user,
            action,
            "Instance",
            instanceId.ToString(),
            instanceUniqueName,
            detail,
            CancellationToken.None);
    }
    catch (Exception auditFailure)
    {
        // A failed audit write must not bury the timeout for the caller —
        // just log so an operator can investigate the audit-pipeline issue.
        _logger.LogWarning(auditFailure,
            "Failed to write {Action} audit entry for instance {Instance} (commandId={CommandId})",
            action, instanceUniqueName, commandId);
    }
}

/// <summary>
/// WP-8: stores the deployed configuration snapshot for an instance — updates
/// the existing snapshot when one is found, otherwise adds a new one. Does not
/// call <c>SaveChangesAsync</c> itself.
/// </summary>
/// <param name="instanceId">Instance the snapshot belongs to.</param>
/// <param name="deploymentId">Deployment ID recorded on the snapshot.</param>
/// <param name="revisionHash">Revision hash recorded on the snapshot.</param>
/// <param name="configJson">Configuration JSON recorded on the snapshot.</param>
/// <param name="cancellationToken">Cancellation token for the repository calls.</param>
private async Task StoreDeployedSnapshotAsync(
    int instanceId,
    string deploymentId,
    string revisionHash,
    string configJson,
    CancellationToken cancellationToken)
{
    var current = await _repository.GetDeployedSnapshotByInstanceIdAsync(instanceId, cancellationToken);

    if (current == null)
    {
        // First snapshot for this instance.
        var created = new DeployedConfigSnapshot(deploymentId, revisionHash, configJson)
        {
            InstanceId = instanceId,
            DeployedAt = DateTimeOffset.UtcNow
        };
        await _repository.AddDeployedSnapshotAsync(created, cancellationToken);
        return;
    }

    // Refresh the existing snapshot in place.
    current.DeploymentId = deploymentId;
    current.RevisionHash = revisionHash;
    current.ConfigurationJson = configJson;
    current.DeployedAt = DateTimeOffset.UtcNow;
    await _repository.UpdateDeployedSnapshotAsync(current, cancellationToken);
}
}

/// <summary>
/// WP-8: Result of comparing deployed vs template-derived configuration.
/// </summary>
/// <param name="Diff">
/// DeploymentManager-007: structured added/removed/changed detail for
/// attributes, alarms, and scripts. Null only when the deployed snapshot could
/// not be deserialized (corrupt / older schema), in which case
/// <paramref name="IsStale"/> still reflects the hash comparison.
/// </param>
/// <param name="InstanceId">Identifier of the instance the comparison was made for.</param>
/// <param name="DeployedRevisionHash">Revision hash on the deployed side of the comparison.</param>
/// <param name="CurrentRevisionHash">Revision hash on the current (template-derived) side of the comparison.</param>
/// <param name="IsStale">
/// Staleness flag derived from the hash comparison. NOTE(review): presumably
/// <c>true</c> when the two hashes differ — confirm against the code that builds this result.
/// </param>
/// <param name="DeployedAt">
/// Timestamp associated with the deployed configuration. NOTE(review): presumably the
/// stored snapshot's <c>DeployedAt</c> — confirm against the code that builds this result.
/// </param>
public record DeploymentComparisonResult(
    int InstanceId,
    string DeployedRevisionHash,
    string CurrentRevisionHash,
    bool IsStale,
    DateTimeOffset DeployedAt,
    ConfigurationDiff? Diff = null);