cdd65beb6c
Closes the four remaining items in the 2026-06-24 template-inheritance/CLI follow-up tracker. #4 — CLI `instance set-bindings` can now set DataSourceReferenceOverride. `--bindings` accepts an optional 3rd element per entry: [attributeName, dataConnectionId, dataSourceReferenceOverride]. A string sets the override; a JSON null or an omitted 3rd element leaves it unset (template default). TryParseBindings accepts 2- or 3-element entries and rejects a non-string/non-null 3rd element or 4+ elements with a clean error. Previously the CLI sent the override as null and silently wiped any existing one (only a raw POST /management could set it). #5 — `template update` is partial, not full-replace (fixed server-side so all clients benefit). UpdateTemplateAsync now uses leave-unchanged semantics: a null description keeps the stored value (pass "" to clear); a null parentTemplateId keeps the existing parent. Parent stays immutable — a non-null differing value is still rejected — but omitting --parent-id is now a no-op instead of failing every derived-template update. #6 — compact `template list`/`get` table output + `--detail`. Table output is now id/name/description/parent/derived + member counts (#attrs/#alarms/ #scripts/#comps/#nativeAlarms) via TemplateTableProjection, fed through a new optional tableProjector seam on CommandHelpers. `--detail` restores the full dump. JSON output is left untouched (always full) so machine consumers are unaffected — the projector only runs on the table path. #8 — structured deploy-time validation error. New ValidationResult.SummarizeErrors() (Commons) returns a grouped, capped summary: leading total count, one line per ValidationCategory, and a per-module rollup (canonical name up to its last dot) with counts + "... and N more module(s)" caps. DeploymentService uses it for the "Pre-deployment validation failed" message and logs the full per-entry list via LogWarning. 
Replaces the flat semicolon-joined dump that became a wall of text for instances with 50-194 unbound attributes. Tests: +8 Commons (SummarizeErrors), +8 CLI (4 binding 3-element / 4 table projection), +2 net TemplateEngine (partial-update). Affected suites green: Commons 587, CLI 341, TemplateEngine 447, DeploymentManager 101, ManagementService 230, CentralUI 866; full solution builds 0/0. Docs: Component-DeploymentManager.md "Validation Error Reporting"; CLI README (set-bindings 3-element form, template update leave-unchanged, list/get --detail); UpdateTemplateCommand doc; known-issues tracker #4/#5/#6/#8 resolved (all 8 items now closed).
1063 lines
54 KiB
C#
1063 lines
54 KiB
C#
using System.Text.Json;
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.Extensions.Options;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Deployment;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Instances;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Lifecycle;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Observability;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening;
|
|
using ZB.MOM.WW.ScadaBridge.Communication;
|
|
using ZB.MOM.WW.ScadaBridge.TemplateEngine.Flattening;
|
|
using ZB.MOM.WW.ScadaBridge.TemplateEngine.Validation;
|
|
|
|
namespace ZB.MOM.WW.ScadaBridge.DeploymentManager;
|
|
|
|
/// <summary>
|
|
/// WP-1: Central-side deployment orchestration service.
|
|
/// Coordinates the full deployment pipeline:
|
|
/// 1. Validate instance state transition (WP-4)
|
|
/// 2. Acquire per-instance operation lock (WP-3)
|
|
/// 3. Flatten configuration via TemplateEngine (captures template state at time of flatten -- WP-16)
|
|
/// 4. Validate flattened configuration
|
|
/// 5. Compute revision hash and diff
|
|
/// 6. Send DeployInstanceCommand to site via CommunicationService
|
|
/// 7. Track deployment status with optimistic concurrency (WP-4)
|
|
/// 8. Store deployed config snapshot (WP-8)
|
|
/// 9. Audit log all actions
|
|
///
|
|
/// WP-2: Each deployment has a unique deployment ID (GUID) + revision hash.
|
|
/// WP-16: Template state captured at flatten time -- last-write-wins on templates is safe.
|
|
/// </summary>
|
|
public class DeploymentService
|
|
{
|
|
/// <summary>
/// Prefix stored in <see cref="DeploymentRecord.ErrorMessage"/> when a deployment
/// fails because the site command timed out or was cancelled. The
/// query-before-redeploy trigger (DeploymentManager-006) relies on it to tell a
/// timeout-induced failure apart from every other deployment error.
/// </summary>
private const string TimeoutFailurePrefix = "Communication failure:";

// --- Persistence -----------------------------------------------------------

// Instances, deployment records and deployed snapshots.
private readonly IDeploymentManagerRepository _repository;

// Site rows; used to map a numeric site id to its routing identifier.
private readonly ISiteRepository _siteRepository;

// --- Configuration processing ----------------------------------------------

// Flattens and validates an instance's template-derived configuration.
private readonly IFlatteningPipeline _flatteningPipeline;

// Computes configuration diffs.
private readonly DiffService _diffService;

// Recomputes a flattened configuration's revision hash.
private readonly RevisionHashService _revisionHashService;

// --- Site communication and serialization of operations ---------------------

// Sends deploy / disable / enable / delete commands to a site.
private readonly CommunicationService _communicationService;

// Hands out the per-instance operation lock, keyed by the instance's unique name.
private readonly OperationLockManager _lockManager;

// --- Observability -----------------------------------------------------------

// Audit trail for every deployment and lifecycle action.
private readonly IAuditService _auditService;

// Pushes deployment status changes to listeners (CentralUI-006).
private readonly IDeploymentStatusNotifier _statusNotifier;

private readonly ILogger<DeploymentService> _logger;

// --- Settings ----------------------------------------------------------------

// Supplies OperationLockTimeout and LifecycleCommandTimeout.
private readonly DeploymentManagerOptions _options;
|
|
|
|
/// <summary>
/// Creates a <see cref="DeploymentService"/> and stores every collaborator it orchestrates.
/// </summary>
/// <param name="repository">Data access for instances, deployment records and snapshots.</param>
/// <param name="siteRepository">Data access for site rows.</param>
/// <param name="flatteningPipeline">Flattens and validates template-derived configurations.</param>
/// <param name="communicationService">Cross-cluster channel used to send commands to sites.</param>
/// <param name="lockManager">Source of the per-instance operation lock.</param>
/// <param name="auditService">Sink for audit log entries.</param>
/// <param name="diffService">Computes configuration diffs.</param>
/// <param name="revisionHashService">
/// Recomputes a flattened configuration's revision hash. Used by
/// <see cref="GetDeploymentComparisonAsync"/> to derive the deployed-side
/// staleness hash from the (List-normalized) deserialized snapshot — see I-1.
/// </param>
/// <param name="statusNotifier">Pushes deployment status changes to the UI.</param>
/// <param name="options">Deployment manager options; only the resolved <c>Value</c> is retained.</param>
/// <param name="logger">Logger for this service.</param>
public DeploymentService(
    IDeploymentManagerRepository repository,
    ISiteRepository siteRepository,
    IFlatteningPipeline flatteningPipeline,
    CommunicationService communicationService,
    OperationLockManager lockManager,
    IAuditService auditService,
    DiffService diffService,
    RevisionHashService revisionHashService,
    IDeploymentStatusNotifier statusNotifier,
    IOptions<DeploymentManagerOptions> options,
    ILogger<DeploymentService> logger)
{
    // Persistence collaborators.
    (_repository, _siteRepository) = (repository, siteRepository);

    // Flatten / diff / hash collaborators.
    (_flatteningPipeline, _diffService, _revisionHashService) =
        (flatteningPipeline, diffService, revisionHashService);

    // Site communication and per-instance serialization.
    (_communicationService, _lockManager) = (communicationService, lockManager);

    // Audit, UI push notifications and logging.
    (_auditService, _statusNotifier, _logger) = (auditService, statusNotifier, logger);

    // Keep the resolved options object rather than the IOptions wrapper.
    _options = options.Value;
}
|
|
|
|
/// <summary>
/// CentralUI-006: announces that a deployment record's status was just
/// persisted, so the Central UI deployment-status page can re-render over its
/// SignalR circuit instead of polling. Invoked at every point a
/// <see cref="DeploymentRecord"/> status is written.
/// </summary>
/// <param name="record">The record whose id, instance id and status are published.</param>
private void NotifyStatusChange(DeploymentRecord record)
{
    var change = new DeploymentStatusChange(record.DeploymentId, record.InstanceId, record.Status);
    _statusNotifier.NotifyStatusChanged(change);
}
|
|
|
|
/// <summary>
/// Maps a numeric site DB id to the site's string identifier. The
/// communication layer routes by that string (e.g. "site-a"), not by DB id.
///
/// DeploymentManager-021: a missing <see cref="Site"/> row (deleted FK, race
/// with an admin delete, DB inconsistency) used to be papered over by sending
/// the numeric id rendered as a string, which made every downstream
/// <c>CommunicationService</c> call fail with a confusing "unknown site"
/// routing error. A missing row is now a hard failure that names the
/// unresolved id. The deploy path's try/catch converts it into a Failed
/// deployment record with a clear message; lifecycle paths let it propagate
/// to the caller (CLI/UI), which reports it to the operator.
/// </summary>
/// <param name="siteId">Database id of the site to resolve.</param>
/// <param name="cancellationToken">Cancellation token for the lookup.</param>
/// <returns>The site's <c>SiteIdentifier</c>.</returns>
/// <exception cref="InvalidOperationException">No site row exists for <paramref name="siteId"/>.</exception>
private async Task<string> ResolveSiteIdentifierAsync(int siteId, CancellationToken cancellationToken)
{
    var siteRow = await _siteRepository.GetSiteByIdAsync(siteId, cancellationToken);
    if (siteRow != null)
    {
        return siteRow.SiteIdentifier;
    }

    throw new InvalidOperationException(
        $"Site with ID {siteId} not found; cannot resolve its SiteIdentifier for routing.");
}
|
|
|
|
/// <summary>
/// WP-1: Deploy an instance to its site.
/// WP-2: Generates unique deployment ID, computes revision hash.
/// WP-4: Validates state transitions, uses optimistic concurrency.
/// WP-5: Site-side apply is all-or-nothing (handled by DeploymentManagerActor).
/// WP-8: Stores deployed config snapshot on success.
/// WP-16: Captures template state at time of flatten.
/// </summary>
/// <remarks>
/// Once the site-reported terminal status has been saved, a failure in a
/// follow-up step (status notification, post-success persistence, audit write)
/// is logged but no longer rewrites the record to Failed: the returned result
/// then reflects the committed status.
/// </remarks>
/// <param name="instanceId">The database ID of the instance to deploy.</param>
/// <param name="user">The username initiating the deployment, recorded in the audit log.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
/// <returns>A task that resolves to a success result containing the deployment record, or a failure result with an error message.</returns>
public async Task<Result<DeploymentRecord>> DeployInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    // Load instance
    var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (instance == null)
        return Result<DeploymentRecord>.Failure($"Instance with ID {instanceId} not found.");

    // WP-4: Validate state transition
    // NOTE(review): the transition is checked against the state loaded BEFORE the
    // operation lock is taken; confirm whether a concurrent lifecycle operation can
    // change instance.State between this check and the lock acquisition below.
    var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "deploy");
    if (transitionError != null)
        return Result<DeploymentRecord>.Failure(transitionError);

    // WP-3: Acquire per-instance operation lock
    using var lockHandle = await _lockManager.AcquireAsync(
        instance.UniqueName, _options.OperationLockTimeout, cancellationToken);

    // WP-2: Generate unique deployment ID
    var deploymentId = Guid.NewGuid().ToString("N");

    // WP-1/16: Flatten configuration (captures template state at this point in time)
    var flattenResult = await _flatteningPipeline.FlattenAndValidateAsync(instanceId, cancellationToken);
    if (flattenResult.IsFailure)
        return Result<DeploymentRecord>.Failure($"Validation failed: {flattenResult.Error}");

    var flattenedConfig = flattenResult.Value.Configuration;
    var revisionHash = flattenResult.Value.RevisionHash;
    var validationResult = flattenResult.Value.Validation;

    if (!validationResult.IsValid)
    {
        // Followup #8: return a grouped/summarized error (leading count + per-module
        // rollup, capped) instead of a flat semicolon-joined dump that becomes a wall
        // of text for instances with dozens of unbound attributes. The full per-entry
        // list still goes to the deploy log for operators who need every clause.
        _logger.LogWarning(
            "Pre-deployment validation failed for instance {InstanceId} ({ErrorCount} error(s)): {Detail}",
            instanceId,
            validationResult.Errors.Count,
            string.Join("; ", validationResult.Errors.Select(e => e.Message)));

        return Result<DeploymentRecord>.Failure(
            $"Pre-deployment validation failed: {validationResult.SummarizeErrors()}");
    }

    // Serialize for transmission (also the payload stored in the deployed
    // snapshot on success / reconciliation).
    var configJson = JsonSerializer.Serialize(flattenedConfig);

    // DeploymentManager-006: query-the-site-before-redeploy idempotency.
    // If a prior deployment for this instance is stuck InProgress or Failed
    // due to a timeout, the site may have actually applied the config. Query
    // the site for its currently-applied revision before re-sending so a
    // duplicate deployment is not produced (design: "Deployment Identity &
    // Idempotency"). A clean prior Success or a fresh first-time deploy
    // skips this extra round-trip.
    var reconciled = await TryReconcileWithSiteAsync(
        instance, revisionHash, configJson, user, cancellationToken);
    if (reconciled != null)
        return Result<DeploymentRecord>.Success(reconciled);

    // WP-4: Create the deployment record directly in InProgress.
    //
    // DeploymentManager-022: the previous code wrote the record as Pending,
    // then immediately updated it to InProgress with no work in between
    // (flattening, validation, and reconciliation all completed above). The
    // back-to-back write cost an extra SaveChangesAsync round-trip, an
    // extra IDeploymentStatusNotifier push (CentralUI-006 rendered a
    // Pending→InProgress flicker for ~ms), and an extra row-version bump
    // for nothing. The transient Pending slot carried no operational
    // meaning — it was set and immediately overwritten — so dropping it
    // collapses the start of the deploy into a single insert + notify.
    // InProgress remains the documented "sent to site, awaiting response"
    // state, set immediately before the round-trip below.
    var record = new DeploymentRecord(deploymentId, user)
    {
        InstanceId = instanceId,
        Status = DeploymentStatus.InProgress,
        RevisionHash = revisionHash,
        DeployedAt = DateTimeOffset.UtcNow
    };

    await _repository.AddDeploymentRecordAsync(record, cancellationToken);
    await _repository.SaveChangesAsync(cancellationToken);
    NotifyStatusChange(record);

    // Becomes true once the site-reported terminal status has been saved. From
    // that point on an exception must not rewrite the record (see
    // DeploymentManager-003 below and the first catch clause).
    var terminalStatusCommitted = false;

    try
    {
        // WP-1: Send to site via CommunicationService
        var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
        var command = new DeployInstanceCommand(
            deploymentId, instance.UniqueName, revisionHash, configJson, user, DateTimeOffset.UtcNow);

        _logger.LogInformation(
            "Sending deployment {DeploymentId} for instance {Instance} to site {SiteId}",
            deploymentId, instance.UniqueName, siteId);

        var response = await _communicationService.DeployInstanceAsync(siteId, command, cancellationToken);

        // WP-1: Update status based on site response.
        record.Status = response.Status;
        record.ErrorMessage = response.ErrorMessage;
        record.CompletedAt = DateTimeOffset.UtcNow;

        // DeploymentManager-003: once the site has confirmed the apply,
        // commit the deployment record's terminal status BEFORE touching
        // instance state and the deployed-config snapshot. If a later write
        // (instance update / snapshot store) fails, the recorded fact that
        // the site succeeded must NOT be lost -- otherwise central reports a
        // non-Success record while the site is running the new config.
        await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
        await _repository.SaveChangesAsync(cancellationToken);
        terminalStatusCommitted = true;
        NotifyStatusChange(record);

        if (response.Status == DeploymentStatus.Success)
        {
            // Telemetry: one instance deployment successfully applied to a
            // site. Counted once per successful deploy operation (the unit
            // of scadabridge.deployments.applied — one DeployInstanceAsync
            // deploys exactly one instance to one site). Emitted only on this
            // confirmed-Success path, so failures, timeouts/retries (the
            // catch block), and the reconciliation path (which recovers a
            // PRIOR timed-out apply rather than performing a fresh one) do
            // not increment it.
            ScadaBridgeTelemetry.RecordDeploymentApplied();

            // The site has applied the deployment. The post-success
            // persistence below is best-effort: a failure here must be
            // logged loudly for operator reconciliation but must not flip
            // the already-committed Success record back to Failed.
            await ApplyPostSuccessSideEffectsAsync(
                instance, deploymentId, revisionHash, configJson,
                forceEnabledState: true, cancellationToken);
        }

        // Audit log
        await _auditService.LogAsync(user, "Deploy", "Instance", instanceId.ToString(),
            instance.UniqueName, new { DeploymentId = deploymentId, Status = record.Status.ToString() },
            cancellationToken);

        _logger.LogInformation(
            "Deployment {DeploymentId} for instance {Instance}: {Status}",
            deploymentId, instance.UniqueName, record.Status);

        return record.Status == DeploymentStatus.Success
            ? Result<DeploymentRecord>.Success(record)
            : Result<DeploymentRecord>.Failure(
                $"Deployment failed: {response.ErrorMessage ?? "Unknown error"}");
    }
    catch (Exception ex) when (terminalStatusCommitted)
    {
        // The site's outcome is already durably recorded. A failure in a
        // follow-up step (status notification, post-success persistence, audit
        // write -- including a cancelled token) must not overwrite it: flipping
        // a committed Success to Failed would make central disagree with the
        // site, and overwriting a site-reported failure would lose the site's
        // own error message. Log loudly and report the committed outcome.
        _logger.LogError(ex,
            "Deployment {DeploymentId} for instance {Instance} was committed as {Status}, but a " +
            "follow-up step failed; the committed status is left unchanged",
            deploymentId, instance.UniqueName, record.Status);

        return record.Status == DeploymentStatus.Success
            ? Result<DeploymentRecord>.Success(record)
            : Result<DeploymentRecord>.Failure(
                $"Deployment failed: {record.ErrorMessage ?? "Unknown error"}");
    }
    catch (Exception ex)
    {
        // DeploymentManager-001: any exception out of the try before the
        // terminal status was committed (timeout, cancellation, transport,
        // serialization, DB) must leave the deployment record as Failed --
        // the design requires an interrupted deployment to be treated as
        // failed, never stuck in InProgress.
        //
        // DeploymentManager-002: the failure-status write must NOT use the
        // operation's cancellation token. If the operation was cancelled or
        // timed out, that token is already cancelled and the cleanup writes
        // would themselves throw before the Failed status is persisted.
        // Use CancellationToken.None so the failure is durably recorded.
        var isTimeout = ex is TimeoutException or OperationCanceledException;

        record.Status = DeploymentStatus.Failed;
        record.ErrorMessage = isTimeout
            ? $"{TimeoutFailurePrefix} {ex.Message}"
            : $"Deployment error: {ex.Message}";
        record.CompletedAt = DateTimeOffset.UtcNow;

        try
        {
            await _repository.UpdateDeploymentRecordAsync(record, CancellationToken.None);
            await _repository.SaveChangesAsync(CancellationToken.None);
            NotifyStatusChange(record);

            await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
                instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message },
                CancellationToken.None);
        }
        catch (Exception cleanupEx)
        {
            // The deployment already failed; a failed cleanup write must not
            // mask the original error. Log loudly so an operator can reconcile.
            _logger.LogError(cleanupEx,
                "Failed to persist Failed status for deployment {DeploymentId} of instance {Instance} " +
                "after deployment error: {Error}",
                deploymentId, instance.UniqueName, ex.Message);
        }

        _logger.LogError(ex,
            "Deployment {DeploymentId} for instance {Instance} failed",
            deploymentId, instance.UniqueName);

        return Result<DeploymentRecord>.Failure(
            isTimeout
                ? $"Deployment timed out: {ex.Message}"
                : $"Deployment failed: {ex.Message}");
    }
}
|
|
|
|
/// <summary>
/// WP-6: Disable an instance. Sends a disable command to the instance's site
/// and, when the site confirms, marks the central instance as
/// <see cref="InstanceState.Disabled"/>. (Per the WP-6 design note the site
/// stops the Instance Actor, retains config, and lets S&amp;F drain — that
/// site-side behavior is not visible in this method.)
/// </summary>
/// <param name="instanceId">The database ID of the instance to disable.</param>
/// <param name="user">The username initiating the operation, recorded in the audit log.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
/// <returns>A task that resolves to a success result with the site response, or a failure result with an error message.</returns>
public async Task<Result<InstanceLifecycleResponse>> DisableInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    var target = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (target == null)
    {
        return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");
    }

    var rejection = StateTransitionValidator.ValidateTransition(target.State, "disable");
    if (rejection != null)
    {
        return Result<InstanceLifecycleResponse>.Failure(rejection);
    }

    // Held until the method returns so lifecycle operations on one instance do not overlap.
    using var operationLock = await _lockManager.AcquireAsync(
        target.UniqueName, _options.OperationLockTimeout, cancellationToken);

    var commandId = Guid.NewGuid().ToString("N");
    var siteIdentifier = await ResolveSiteIdentifierAsync(target.SiteId, cancellationToken);
    var disableCommand = new DisableInstanceCommand(commandId, target.UniqueName, DateTimeOffset.UtcNow);

    // WP-6: cap the site round-trip at the configured lifecycle timeout so a
    // hung or unreachable site cannot hold the operation lock indefinitely.
    InstanceLifecycleResponse siteResponse;
    try
    {
        using var timeoutSource = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        timeoutSource.CancelAfter(_options.LifecycleCommandTimeout);
        siteResponse = await _communicationService.DisableInstanceAsync(
            siteIdentifier, disableCommand, timeoutSource.Token);
    }
    catch (Exception ex) when (ex is TimeoutException || ex is OperationCanceledException)
    {
        // DeploymentManager-019: before this fix a lifecycle timeout left no
        // audit row — the operator saw a timeout in the UI while the audit
        // trail showed nothing, contrary to the design's "audit logging for
        // all instance lifecycle changes" rule. Following the DeployFailed
        // pattern, a "<Action>TimedOut" entry is written with
        // CancellationToken.None so that a cancelled outer token (the typical
        // reason this handler runs) cannot block the durable audit write.
        await TryLogLifecycleTimeoutAsync(
            user, "DisableTimedOut", instanceId, target.UniqueName, commandId, ex);

        _logger.LogWarning(ex, "Disable of instance {Instance} timed out", target.UniqueName);
        return Result<InstanceLifecycleResponse>.Failure(
            $"Disable failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
    }

    // Central state only follows the site once the site has confirmed.
    if (siteResponse.Success)
    {
        target.State = InstanceState.Disabled;
        await _repository.UpdateInstanceAsync(target, cancellationToken);
        await _repository.SaveChangesAsync(cancellationToken);
    }

    // Audited whether or not the site accepted the command.
    await _auditService.LogAsync(user, "Disable", "Instance", instanceId.ToString(),
        target.UniqueName, new { CommandId = commandId, siteResponse.Success },
        cancellationToken);

    if (siteResponse.Success)
    {
        return Result<InstanceLifecycleResponse>.Success(siteResponse);
    }

    return Result<InstanceLifecycleResponse>.Failure(siteResponse.ErrorMessage ?? "Disable failed.");
}
|
|
|
|
/// <summary>
/// WP-6: Enable an instance. Sends an enable command to the instance's site
/// and, when the site confirms, marks the central instance as
/// <see cref="InstanceState.Enabled"/>. (Per the WP-6 design note the site
/// re-creates the Instance Actor from stored config — that site-side behavior
/// is not visible in this method.)
/// </summary>
/// <param name="instanceId">The database ID of the instance to enable.</param>
/// <param name="user">The username initiating the operation, recorded in the audit log.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
/// <returns>A task that resolves to a success result with the site response, or a failure result with an error message.</returns>
public async Task<Result<InstanceLifecycleResponse>> EnableInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    var target = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (target == null)
    {
        return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");
    }

    var rejection = StateTransitionValidator.ValidateTransition(target.State, "enable");
    if (rejection != null)
    {
        return Result<InstanceLifecycleResponse>.Failure(rejection);
    }

    // Held until the method returns so lifecycle operations on one instance do not overlap.
    using var operationLock = await _lockManager.AcquireAsync(
        target.UniqueName, _options.OperationLockTimeout, cancellationToken);

    var commandId = Guid.NewGuid().ToString("N");
    var siteIdentifier = await ResolveSiteIdentifierAsync(target.SiteId, cancellationToken);
    var enableCommand = new EnableInstanceCommand(commandId, target.UniqueName, DateTimeOffset.UtcNow);

    // WP-6: cap the site round-trip at the configured lifecycle timeout.
    InstanceLifecycleResponse siteResponse;
    try
    {
        using var timeoutSource = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        timeoutSource.CancelAfter(_options.LifecycleCommandTimeout);
        siteResponse = await _communicationService.EnableInstanceAsync(
            siteIdentifier, enableCommand, timeoutSource.Token);
    }
    catch (Exception ex) when (ex is TimeoutException || ex is OperationCanceledException)
    {
        // DeploymentManager-019: record the operator's attempted Enable in the
        // audit trail even though the site never answered; the full rationale
        // is in the matching comment in DisableInstanceAsync.
        await TryLogLifecycleTimeoutAsync(
            user, "EnableTimedOut", instanceId, target.UniqueName, commandId, ex);

        _logger.LogWarning(ex, "Enable of instance {Instance} timed out", target.UniqueName);
        return Result<InstanceLifecycleResponse>.Failure(
            $"Enable failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
    }

    // Central state only follows the site once the site has confirmed.
    if (siteResponse.Success)
    {
        target.State = InstanceState.Enabled;
        await _repository.UpdateInstanceAsync(target, cancellationToken);
        await _repository.SaveChangesAsync(cancellationToken);
    }

    // Audited whether or not the site accepted the command.
    await _auditService.LogAsync(user, "Enable", "Instance", instanceId.ToString(),
        target.UniqueName, new { CommandId = commandId, siteResponse.Success },
        cancellationToken);

    if (siteResponse.Success)
    {
        return Result<InstanceLifecycleResponse>.Success(siteResponse);
    }

    return Result<InstanceLifecycleResponse>.Failure(siteResponse.ErrorMessage ?? "Enable failed.");
}
|
|
|
|
/// <summary>
/// WP-6: Delete an instance. Stops the site actor, removes site config, and
/// removes the central instance record (deployment history, snapshot,
/// overrides, and connection bindings go with it). S&amp;F NOT cleared.
/// The site round-trip is bounded here by
/// <c>DeploymentManagerOptions.LifecycleCommandTimeout</c>; a timeout or
/// cancellation fails the delete and leaves the central record in place.
/// (<see cref="CommunicationService.DeleteInstanceAsync"/> is documented as
/// also applying <c>CommunicationOptions.LifecycleTimeout</c> internally —
/// that is not visible from this method.)
/// </summary>
/// <remarks>
/// If the site confirms the delete but the central record cannot be removed,
/// a distinct "orphaned" failure is returned (DeploymentManager-004). The
/// accompanying <c>DeleteOrphaned</c> audit write is best-effort so an audit
/// failure cannot replace that result with an unhandled exception.
/// </remarks>
/// <param name="instanceId">The database ID of the instance to delete.</param>
/// <param name="user">The username initiating the deletion, recorded in the audit log.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
/// <returns>A task that resolves to a success result with the site response, or a failure result with an error message.</returns>
public async Task<Result<InstanceLifecycleResponse>> DeleteInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (instance == null)
        return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");

    var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "delete");
    if (transitionError != null)
        return Result<InstanceLifecycleResponse>.Failure(transitionError);

    using var lockHandle = await _lockManager.AcquireAsync(
        instance.UniqueName, _options.OperationLockTimeout, cancellationToken);

    var commandId = Guid.NewGuid().ToString("N");
    var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
    var command = new DeleteInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);

    // WP-6: bound the round-trip with the configured lifecycle timeout.
    InstanceLifecycleResponse response;
    try
    {
        using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        cts.CancelAfter(_options.LifecycleCommandTimeout);
        response = await _communicationService.DeleteInstanceAsync(siteId, command, cts.Token);
    }
    catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
    {
        // DeploymentManager-019: emit an audit entry on lifecycle timeout
        // so the operator's attempted Delete is recorded; see the matching
        // comment in DisableInstanceAsync for the full rationale.
        await TryLogLifecycleTimeoutAsync(
            user, "DeleteTimedOut", instanceId, instance.UniqueName, commandId, ex);

        _logger.LogWarning(ex, "Delete of instance {Instance} timed out", instance.UniqueName);
        return Result<InstanceLifecycleResponse>.Failure(
            $"Delete failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
    }

    if (response.Success)
    {
        // Delete means delete: remove the instance record entirely.
        // Deployment records, snapshot, overrides, and connection bindings
        // are removed with it (see repository implementation).
        //
        // DeploymentManager-004: the site has already destroyed the Instance
        // Actor and removed its config. If the central record removal now
        // fails (DB error / concurrency), the exception must NOT escape
        // uncaught -- that would leave the central record orphaned and
        // un-deletable through the normal path (a re-issued delete may fail
        // because the site no longer has the instance). Surface a distinct
        // failure so an operator can reconcile.
        //
        // NOTE(review): the removal still runs under the caller's token, so a
        // cancellation arriving after the site confirmed also lands in the
        // orphan path below; confirm whether CancellationToken.None is wanted here.
        try
        {
            await _repository.DeleteInstanceAsync(instanceId, cancellationToken);
            await _repository.SaveChangesAsync(cancellationToken);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex,
                "Instance {Instance} was deleted at the site, but the central record could not be " +
                "removed -- the central record is now orphaned and must be reconciled manually",
                instance.UniqueName);

            // Best-effort audit: a failing audit write must not escape this
            // handler and hide the orphan failure from the caller (same
            // approach as the failure cleanup in DeployInstanceAsync).
            try
            {
                await _auditService.LogAsync(user, "DeleteOrphaned", "Instance", instanceId.ToString(),
                    instance.UniqueName, new { CommandId = commandId, Error = ex.Message },
                    CancellationToken.None);
            }
            catch (Exception auditEx)
            {
                _logger.LogError(auditEx,
                    "Failed to write the DeleteOrphaned audit entry for instance {Instance} " +
                    "after central record removal failed: {Error}",
                    instance.UniqueName, ex.Message);
            }

            return Result<InstanceLifecycleResponse>.Failure(
                $"The site deleted instance '{instance.UniqueName}', but the central record could not " +
                $"be removed: {ex.Message}. The central record is orphaned and must be reconciled.");
        }
    }

    await _auditService.LogAsync(user, "Delete", "Instance", instanceId.ToString(),
        instance.UniqueName, new { CommandId = commandId, response.Success },
        cancellationToken);

    return response.Success
        ? Result<InstanceLifecycleResponse>.Success(response)
        : Result<InstanceLifecycleResponse>.Failure(
            response.ErrorMessage ?? "Delete failed. Site may be unreachable.");
}
|
|
|
|
/// <summary>
/// WP-8: Compares the configuration that was last deployed for an instance
/// (its stored snapshot) with a fresh flatten of the instance's current
/// template-derived configuration. The result carries a staleness flag and,
/// when the snapshot can be deserialized, a structured
/// <see cref="ConfigurationDiff"/> produced by the TemplateEngine
/// <see cref="DiffService"/>.
/// </summary>
/// <param name="instanceId">The database ID of the instance to compare.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
/// <returns>
/// A success result holding the comparison, or a failure result when no
/// deployed snapshot exists or the current configuration cannot be flattened.
/// </returns>
public async Task<Result<DeploymentComparisonResult>> GetDeploymentComparisonAsync(
    int instanceId,
    CancellationToken cancellationToken = default)
{
    var snapshot = await _repository.GetDeployedSnapshotByInstanceIdAsync(instanceId, cancellationToken);
    if (snapshot == null)
    {
        return Result<DeploymentComparisonResult>.Failure("No deployed snapshot found for this instance.");
    }

    // Flatten the instance as it stands today; this is the "current" side.
    var flattened = await _flatteningPipeline.FlattenAndValidateAsync(instanceId, cancellationToken);
    if (flattened.IsFailure)
    {
        return Result<DeploymentComparisonResult>.Failure($"Cannot compute current config: {flattened.Error}");
    }

    var currentConfig = flattened.Value.Configuration;
    var currentHash = flattened.Value.RevisionHash;

    // I-1 (latent): the snapshot froze the flattened config and its hash at
    // deploy time, while the fresh flatten always emits List values in native
    // JSON-array form (#93). A List value stored in the OLD quoted form would
    // therefore hash differently and show up as a Changed attribute even
    // though nothing changed. To avoid both artefacts the deserialized
    // snapshot is normalized first, and BOTH the deployed-side hash and the
    // diff are derived from that normalized copy.
    //
    // DeploymentManager-007: when the snapshot cannot be deserialized, fall
    // back to the stored hash and report no structured diff.
    var deployedHash = snapshot.RevisionHash;
    ConfigurationDiff? diff = null;

    try
    {
        var deserialized = JsonSerializer.Deserialize<FlattenedConfiguration>(snapshot.ConfigurationJson);
        if (deserialized == null)
        {
            _logger.LogWarning(
                "Deployed snapshot for instance {InstanceId} deserialized to null; " +
                "returning hash-based comparison without a structured diff",
                instanceId);
        }
        else
        {
            var normalizedSnapshot = NormalizeListAttributeValues(deserialized);

            // Re-hash the normalized snapshot so an old-form List value is
            // not reported stale against the native re-flatten.
            deployedHash = _revisionHashService.ComputeHash(normalizedSnapshot);

            diff = _diffService.ComputeDiff(normalizedSnapshot, currentConfig, deployedHash, currentHash);
        }
    }
    catch (JsonException ex)
    {
        _logger.LogWarning(ex,
            "Could not deserialize deployed snapshot for instance {InstanceId}; " +
            "returning hash-based comparison without a structured diff",
            instanceId);
    }

    var comparison = new DeploymentComparisonResult(
        instanceId,
        deployedHash,
        currentHash,
        deployedHash != currentHash,
        snapshot.DeployedAt,
        diff);

    return Result<DeploymentComparisonResult>.Success(comparison);
}
|
|
|
|
/// <summary>
/// I-1 (latent): normalizes a single <see cref="DataType.List"/> attribute by
/// round-tripping its value through <see cref="AttributeValueCodec.Decode"/>
/// and <see cref="AttributeValueCodec.Encode"/>, so a value stored in the OLD
/// quoted form (e.g. <c>["10","20"]</c>) becomes the native form
/// (<c>[10,20]</c>) and no longer produces a spurious staleness flag or diff
/// entry.
/// </summary>
/// <param name="attr">The attribute taken from a deserialized deployed snapshot.</param>
/// <returns>
/// <paramref name="attr"/> itself (same reference) when it is not a List
/// attribute, has no value, has an element type that cannot be parsed or is
/// not accepted by <see cref="AttributeValueCodec.IsValidElementType"/>, fails
/// to decode with a <see cref="FormatException"/>, or is already in normalized
/// form. Otherwise a copy whose <c>Value</c> is the re-encoded value.
/// </returns>
private ResolvedAttribute NormalizeListAttribute(ResolvedAttribute attr)
{
    // Only non-empty List values are candidates; everything else passes through.
    var isList = string.Equals(attr.DataType, nameof(DataType.List), StringComparison.OrdinalIgnoreCase);
    if (!isList || string.IsNullOrEmpty(attr.Value))
    {
        return attr;
    }

    var elementTypeParsed = Enum.TryParse<DataType>(attr.ElementDataType, ignoreCase: true, out var elementType);
    if (!elementTypeParsed || !AttributeValueCodec.IsValidElementType(elementType))
    {
        return attr;
    }

    try
    {
        var decoded = AttributeValueCodec.Decode(attr.Value, DataType.List, elementType);
        var reencoded = AttributeValueCodec.Encode(decoded);

        // Hand back the original reference when nothing changed so callers
        // can detect "no normalization needed" with a reference comparison.
        if (reencoded == attr.Value)
        {
            return attr;
        }

        return attr with { Value = reencoded };
    }
    catch (FormatException ex)
    {
        // Best-effort: keep the stored value rather than aborting the
        // read-only comparison, and log it so an operator can investigate.
        _logger.LogWarning(ex,
            "Could not normalize List attribute '{Attribute}' in deployed snapshot; " +
            "comparing its stored value verbatim",
            attr.CanonicalName);
        return attr;
    }
}
|
|
|
|
/// <summary>
/// I-1 (latent): runs <see cref="NormalizeListAttribute"/> over every
/// attribute of <paramref name="config"/>.
/// </summary>
/// <param name="config">The deserialized deployed configuration.</param>
/// <returns>
/// <paramref name="config"/> itself when it has no attributes or none of them
/// was replaced by normalization; otherwise a copy whose <c>Attributes</c>
/// is the normalized list.
/// </returns>
private FlattenedConfiguration NormalizeListAttributeValues(FlattenedConfiguration config)
{
    var originals = config.Attributes;
    if (originals.Count == 0)
    {
        return config;
    }

    var normalized = originals.Select(attribute => NormalizeListAttribute(attribute)).ToList();

    // NormalizeListAttribute returns the same reference when it leaves an
    // attribute alone, so a reference mismatch at any index means at least
    // one value was rewritten. Stop scanning at the first mismatch.
    var anyReplaced = false;
    for (var index = 0; index < normalized.Count && !anyReplaced; index++)
    {
        anyReplaced = !ReferenceEquals(normalized[index], originals[index]);
    }

    if (!anyReplaced)
    {
        return config;
    }

    return config with { Attributes = normalized };
}
|
|
|
|
/// <summary>
/// WP-2: Looks up the persisted <see cref="DeploymentRecord"/> for a
/// deployment ID through the repository. This method only reads central
/// state; it does not contact the site. The query-the-site-before-redeploy
/// reconciliation (design: "Deployment Identity &amp; Idempotency") is handled
/// by <see cref="TryReconcileWithSiteAsync"/>, which
/// <see cref="DeployInstanceAsync"/> invokes on the deploy path.
/// </summary>
/// <param name="deploymentId">The unique deployment identifier to look up.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
/// <returns>A task that resolves to the matching deployment record, or null if none exists.</returns>
public async Task<DeploymentRecord?> GetDeploymentStatusAsync(
    string deploymentId,
    CancellationToken cancellationToken = default)
    => await _repository.GetDeploymentByDeploymentIdAsync(deploymentId, cancellationToken);
|
|
|
|
/// <summary>
/// DeploymentManager-006: query-the-site-before-redeploy reconciliation.
///
/// The site is queried ONLY when <see cref="ShouldQuerySiteBeforeRedeploy"/>
/// accepts the instance's prior <see cref="DeploymentRecord"/> (stuck
/// <see cref="DeploymentStatus.InProgress"/>, or
/// <see cref="DeploymentStatus.Failed"/> because of a timeout). Those are the
/// cases where the site may have applied the config without central learning
/// of it; every other case skips the extra round-trip.
///
/// When the site reports that it is deployed with the TARGET revision hash,
/// the prior record is rewritten to <see cref="DeploymentStatus.Success"/>
/// (including correcting its <see cref="DeploymentRecord.RevisionHash"/> to
/// the target, DeploymentManager-016), the shared post-success side effects
/// are applied (DeploymentManager-015), a <c>DeployReconciled</c> audit entry
/// is written, and the record is returned. The caller must NOT re-send the
/// deploy in that case.
///
/// In every other outcome, including a failed or timed-out site query,
/// <c>null</c> is returned and the normal deploy proceeds. Site-side
/// stale-rejection of an older revision hash is the safety net.
/// </summary>
/// <param name="instance">The instance being (re)deployed.</param>
/// <param name="targetRevisionHash">The revision hash the caller is about to deploy.</param>
/// <param name="configJson">The configuration JSON to store as the deployed snapshot on reconciliation.</param>
/// <param name="currentUser">The user driving this redeploy; recorded as the audit actor.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
/// <returns>The reconciled prior record, or <c>null</c> when a normal deploy should proceed.</returns>
private async Task<DeploymentRecord?> TryReconcileWithSiteAsync(
    Instance instance,
    string targetRevisionHash,
    string configJson,
    string currentUser,
    CancellationToken cancellationToken)
{
    var prior = await _repository.GetCurrentDeploymentStatusAsync(instance.Id, cancellationToken);
    if (prior == null)
    {
        return null;
    }

    if (!ShouldQuerySiteBeforeRedeploy(prior))
    {
        return null;
    }

    DeploymentStateQueryResponse siteState;
    try
    {
        var siteIdentifier = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
        var stateQuery = new DeploymentStateQueryRequest(
            Guid.NewGuid().ToString("N"), instance.UniqueName, DateTimeOffset.UtcNow);

        _logger.LogInformation(
            "Querying site {SiteId} for applied deployment state of instance {Instance} " +
            "before re-deploy (prior record {DeploymentId} is {Status})",
            siteIdentifier, instance.UniqueName, prior.DeploymentId, prior.Status);

        siteState = await _communicationService.QueryDeploymentStateAsync(
            siteIdentifier, stateQuery, cancellationToken);
    }
    catch (Exception ex)
    {
        // A failed query (site unreachable / timeout) never aborts the
        // deploy: report "nothing reconciled" and let the normal path run.
        _logger.LogWarning(ex,
            "Site query before re-deploy of instance {Instance} failed; " +
            "proceeding with normal deploy (site-side stale-rejection is the safety net)",
            instance.UniqueName);
        return null;
    }

    var siteHasTargetRevision = siteState.IsDeployed
        && string.Equals(siteState.AppliedRevisionHash, targetRevisionHash, StringComparison.Ordinal);
    if (!siteHasTargetRevision)
    {
        // The site is not deployed, or runs a different revision: proceed
        // with the normal deploy.
        return null;
    }

    // The site already runs the target revision, so the prior deployment
    // actually succeeded. Repair the stale record instead of re-sending.
    _logger.LogInformation(
        "Site already has target revision {RevisionHash} for instance {Instance}; " +
        "marking prior deployment record {DeploymentId} Success without re-deploying",
        targetRevisionHash, instance.UniqueName, prior.DeploymentId);

    prior.Status = DeploymentStatus.Success;
    prior.ErrorMessage = null;
    prior.CompletedAt = DateTimeOffset.UtcNow;

    // DeploymentManager-016: the prior record may carry a different (stale)
    // hash than the target. The site confirmed the target revision, so the
    // persisted record, the audit entry below, and the site must all agree.
    prior.RevisionHash = targetRevisionHash;

    await _repository.UpdateDeploymentRecordAsync(prior, cancellationToken);
    await _repository.SaveChangesAsync(cancellationToken);
    NotifyStatusChange(prior);

    // DeploymentManager-015: apply the same post-success side effects as the
    // normal deploy path so central state matches what the site is running.
    //
    // DeploymentManager-018: this path runs for InProgress / timeout-Failed
    // records, which are the ones that survive a central failover. The
    // in-memory operation lock is lost on failover, so an operator may have
    // Disabled the instance between the original deploy and this redeploy.
    // Disable does not change the deployed config, so the site still reports
    // the target hash. forceEnabledState: false makes the helper promote only
    // NotDeployed -> Enabled and leave an explicit Disabled untouched.
    await ApplyPostSuccessSideEffectsAsync(
        instance, prior.DeploymentId, targetRevisionHash, configJson,
        forceEnabledState: false, cancellationToken);

    // DeploymentManager-020: the audit actor is the user driving THIS
    // redeploy. The user who launched the original run is kept in the
    // detail object for forensics.
    var auditDetail = new
    {
        prior.DeploymentId,
        RevisionHash = targetRevisionHash,
        OriginalDeployer = prior.DeployedBy
    };
    await _auditService.LogAsync(currentUser, "DeployReconciled", "Instance",
        instance.Id.ToString(), instance.UniqueName,
        auditDetail,
        cancellationToken);

    return prior;
}
|
|
|
|
/// <summary>
/// DeploymentManager-006: decides whether the site must be queried before a
/// re-deploy. The answer is yes when the prior record is
/// <see cref="DeploymentStatus.InProgress"/>, or when it is
/// <see cref="DeploymentStatus.Failed"/> and its error message begins with
/// the <see cref="TimeoutFailurePrefix"/> marker (ordinal comparison). Every
/// other prior state skips the query.
/// </summary>
/// <param name="prior">The most recent deployment record for the instance.</param>
/// <returns><c>true</c> when the site should be queried first; otherwise <c>false</c>.</returns>
private static bool ShouldQuerySiteBeforeRedeploy(DeploymentRecord prior)
{
    if (prior.Status == DeploymentStatus.InProgress)
    {
        return true;
    }

    if (prior.Status != DeploymentStatus.Failed)
    {
        return false;
    }

    // A Failed record qualifies only when it carries the timeout marker.
    return prior.ErrorMessage != null
        && prior.ErrorMessage.StartsWith(TimeoutFailurePrefix, StringComparison.Ordinal);
}
|
|
|
|
/// <summary>
/// Post-success side effects shared by the normal deploy path and the
/// DeploymentManager-006 reconciliation path (DeploymentManager-015): update
/// the instance state (WP-4), store or refresh the deployed config snapshot
/// (WP-8), and save.
///
/// DeploymentManager-018: with <paramref name="forceEnabledState"/> set to
/// <c>true</c> (the normal deploy path) the instance is always moved to
/// <see cref="InstanceState.Enabled"/>. With <c>false</c> (the reconciliation
/// path) only <see cref="InstanceState.NotDeployed"/> is promoted to
/// <see cref="InstanceState.Enabled"/>, so a Disabled state set between the
/// original timed-out deploy and the redeploy is preserved.
///
/// Best-effort: any exception raised here is logged at Error and swallowed.
/// The caller has already committed the deployment record as
/// <see cref="DeploymentStatus.Success"/>, and a failure here must not flip
/// that record back to Failed.
/// </summary>
/// <param name="instance">The instance whose state and snapshot are updated.</param>
/// <param name="deploymentId">The deployment the snapshot belongs to.</param>
/// <param name="revisionHash">The revision hash stored with the snapshot.</param>
/// <param name="configJson">The configuration JSON stored with the snapshot.</param>
/// <param name="forceEnabledState">Whether to set Enabled regardless of the current state.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
private async Task ApplyPostSuccessSideEffectsAsync(
    Instance instance,
    string deploymentId,
    string revisionHash,
    string configJson,
    bool forceEnabledState,
    CancellationToken cancellationToken)
{
    try
    {
        // WP-4 / DeploymentManager-018: decide whether this call may enable
        // the instance before persisting it.
        var shouldEnable = forceEnabledState || instance.State == InstanceState.NotDeployed;
        if (shouldEnable)
        {
            instance.State = InstanceState.Enabled;
        }

        await _repository.UpdateInstanceAsync(instance, cancellationToken);

        // WP-8: store or refresh the deployed config snapshot.
        await StoreDeployedSnapshotAsync(
            instance.Id, deploymentId, revisionHash, configJson, cancellationToken);

        await _repository.SaveChangesAsync(cancellationToken);
    }
    catch (Exception persistenceFailure)
    {
        _logger.LogError(persistenceFailure,
            "Deployment {DeploymentId} for instance {Instance} was applied by the site and " +
            "recorded Success, but post-success persistence (instance state / config snapshot) " +
            "failed -- central and site state may diverge until reconciled",
            deploymentId, instance.UniqueName);
    }
}
|
|
|
|
/// <summary>
/// DeploymentManager-019: writes a "&lt;Action&gt;TimedOut" audit entry on
/// behalf of a lifecycle command (Disable / Enable / Delete) whose site
/// round-trip exceeded <see cref="DeploymentManagerOptions.LifecycleCommandTimeout"/>.
///
/// <para>
/// The audit write uses <see cref="CancellationToken.None"/>, mirroring the
/// <c>DeployFailed</c> pattern in <see cref="DeployInstanceAsync"/>, so the
/// operator's outer cancellation cannot also prevent the audit row from being
/// persisted. The detail object carries the command id, the configured
/// timeout, and the exception message so the entry can be correlated with the
/// timeout error shown to the operator.
/// </para>
///
/// <para>
/// Never throws: a failed audit write is logged at Warning and swallowed so
/// it cannot mask the underlying timeout from the caller.
/// </para>
/// </summary>
/// <param name="user">The username who initiated the lifecycle command.</param>
/// <param name="action">The audit action name (<c>DisableTimedOut</c>, <c>EnableTimedOut</c>, or <c>DeleteTimedOut</c>).</param>
/// <param name="instanceId">The numeric instance id, recorded on the audit row.</param>
/// <param name="instanceUniqueName">The instance unique name used as the audit target name.</param>
/// <param name="commandId">The lifecycle command's correlation id, so the audit entry can be matched to logs.</param>
/// <param name="timeoutException">The captured <see cref="TimeoutException"/> or <see cref="OperationCanceledException"/>.</param>
private async Task TryLogLifecycleTimeoutAsync(
    string user,
    string action,
    int instanceId,
    string instanceUniqueName,
    string commandId,
    Exception timeoutException)
{
    try
    {
        var timeoutDetail = new
        {
            CommandId = commandId,
            Deadline = _options.LifecycleCommandTimeout,
            Error = timeoutException.Message,
        };

        await _auditService.LogAsync(
            user, action, "Instance", instanceId.ToString(), instanceUniqueName,
            timeoutDetail,
            CancellationToken.None);
    }
    catch (Exception auditFailure)
    {
        // The caller still needs to see the timeout, so only log here for an
        // operator to follow up on the audit pipeline.
        _logger.LogWarning(auditFailure,
            "Failed to write {Action} audit entry for instance {Instance} (commandId={CommandId})",
            action, instanceUniqueName, commandId);
    }
}
|
|
|
|
/// <summary>
/// WP-8: upserts the deployed config snapshot for an instance. When the
/// repository already holds a snapshot for <paramref name="instanceId"/> it is
/// overwritten in place; otherwise a new <see cref="DeployedConfigSnapshot"/>
/// is added. In both cases <c>DeployedAt</c> is set to the current UTC time.
/// This method does not call <c>SaveChangesAsync</c>; within this file the
/// caller (<see cref="ApplyPostSuccessSideEffectsAsync"/>) does so afterwards.
/// </summary>
/// <param name="instanceId">The instance the snapshot belongs to.</param>
/// <param name="deploymentId">The deployment that produced the snapshot.</param>
/// <param name="revisionHash">The revision hash of the deployed configuration.</param>
/// <param name="configJson">The deployed configuration as JSON.</param>
/// <param name="cancellationToken">Cancellation token for the operation.</param>
private async Task StoreDeployedSnapshotAsync(
    int instanceId,
    string deploymentId,
    string revisionHash,
    string configJson,
    CancellationToken cancellationToken)
{
    var current = await _repository.GetDeployedSnapshotByInstanceIdAsync(instanceId, cancellationToken);

    if (current == null)
    {
        // First snapshot for this instance.
        var created = new DeployedConfigSnapshot(deploymentId, revisionHash, configJson)
        {
            InstanceId = instanceId,
            DeployedAt = DateTimeOffset.UtcNow
        };
        await _repository.AddDeployedSnapshotAsync(created, cancellationToken);
        return;
    }

    // Refresh the existing snapshot so it describes the latest deployment.
    current.DeploymentId = deploymentId;
    current.RevisionHash = revisionHash;
    current.ConfigurationJson = configJson;
    current.DeployedAt = DateTimeOffset.UtcNow;
    await _repository.UpdateDeployedSnapshotAsync(current, cancellationToken);
}
|
|
}
|
|
|
|
/// <summary>
/// WP-8: Outcome of comparing an instance's deployed configuration with its
/// current template-derived configuration.
/// </summary>
/// <param name="InstanceId">The database ID of the instance that was compared.</param>
/// <param name="DeployedRevisionHash">The revision hash representing the deployed side of the comparison.</param>
/// <param name="CurrentRevisionHash">The revision hash of the freshly flattened current configuration.</param>
/// <param name="IsStale">Whether the deployed and current revision hashes differ.</param>
/// <param name="DeployedAt">When the compared snapshot was deployed.</param>
/// <param name="Diff">
/// DeploymentManager-007: structured added/removed/changed detail for
/// attributes, alarms, and scripts. Null when the deployed snapshot could not
/// be deserialized (corrupt / older schema); <see cref="IsStale"/> still
/// reflects the hash comparison in that case.
/// </param>
public record DeploymentComparisonResult(
    int InstanceId,
    string DeployedRevisionHash,
    string CurrentRevisionHash,
    bool IsStale,
    DateTimeOffset DeployedAt,
    ConfigurationDiff? Diff = null);
|