Phase 3C: Deployment pipeline & Store-and-Forward engine

Deployment Manager (WP-1–8, WP-16):
- DeploymentService: full pipeline (flatten→validate→send→track→audit)
- OperationLockManager: per-instance concurrency control
- StateTransitionValidator: Enabled/Disabled/NotDeployed transition matrix
- ArtifactDeploymentService: broadcast to all sites with per-site results
- Deployment identity (GUID + revision hash), idempotency, staleness detection
- Instance lifecycle commands (disable/enable/delete) with deduplication

Store-and-Forward (WP-9–15):
- StoreAndForwardStorage: SQLite persistence, 3 categories, no max buffer
- StoreAndForwardService: fixed-interval retry, transient-only buffering, parking
- ReplicationService: async best-effort to standby (fire-and-forget)
- Parked message management (query/retry/discard from central)
- Messages survive instance deletion, S&F drains on disable

620 tests pass (+79 new), zero warnings.
This commit is contained in:
Joseph Doherty
2026-03-16 21:27:18 -04:00
parent b75bf52fb4
commit 6ea38faa6f
40 changed files with 3289 additions and 29 deletions
@@ -0,0 +1,393 @@
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ScadaLink.Commons.Entities.Deployment;
using ScadaLink.Commons.Entities.Instances;
using ScadaLink.Commons.Interfaces.Repositories;
using ScadaLink.Commons.Interfaces.Services;
using ScadaLink.Commons.Messages.Deployment;
using ScadaLink.Commons.Messages.Lifecycle;
using ScadaLink.Commons.Types;
using ScadaLink.Commons.Types.Enums;
using ScadaLink.Commons.Types.Flattening;
using ScadaLink.Communication;
using ScadaLink.TemplateEngine.Flattening;
using ScadaLink.TemplateEngine.Validation;
namespace ScadaLink.DeploymentManager;
/// <summary>
/// WP-1: Central-side deployment orchestration service.
/// Coordinates the full deployment pipeline:
/// 1. Validate instance state transition (WP-4)
/// 2. Acquire per-instance operation lock (WP-3)
/// 3. Flatten configuration via TemplateEngine (captures template state at time of flatten -- WP-16)
/// 4. Validate flattened configuration
/// 5. Compute revision hash and diff
/// 6. Send DeployInstanceCommand to site via CommunicationService
/// 7. Track deployment status with optimistic concurrency (WP-4)
/// 8. Store deployed config snapshot (WP-8)
/// 9. Audit log all actions
///
/// WP-2: Each deployment has a unique deployment ID (GUID) + revision hash.
/// WP-16: Template state captured at flatten time -- last-write-wins on templates is safe.
/// </summary>
public class DeploymentService
{
// Persistence for instances, deployment records and deployed config snapshots.
private readonly IDeploymentManagerRepository _repository;

// Produces the flattened configuration, its revision hash and the validation result.
private readonly IFlatteningPipeline _flatteningPipeline;

// Sends deploy/disable/enable/delete commands to the target site.
private readonly CommunicationService _communicationService;

// Hands out the per-instance operation lock, keyed by the instance's unique name.
private readonly OperationLockManager _lockManager;

// Receives one audit entry per deploy/lifecycle operation.
private readonly IAuditService _auditService;

// Supplies OperationLockTimeout for lock acquisition.
private readonly DeploymentManagerOptions _options;

private readonly ILogger<DeploymentService> _logger;

/// <summary>
/// Creates the service from its collaborators. The options wrapper is unwrapped once
/// here, so the service keeps the <see cref="DeploymentManagerOptions"/> value that was
/// current at construction time.
/// </summary>
public DeploymentService(
    IDeploymentManagerRepository repository,
    IFlatteningPipeline flatteningPipeline,
    CommunicationService communicationService,
    OperationLockManager lockManager,
    IAuditService auditService,
    IOptions<DeploymentManagerOptions> options,
    ILogger<DeploymentService> logger)
{
    (_repository, _flatteningPipeline, _communicationService) =
        (repository, flatteningPipeline, communicationService);
    (_lockManager, _auditService, _logger) = (lockManager, auditService, logger);
    _options = options.Value;
}
/// <summary>
/// WP-1: Deploys an instance to its site.
/// Steps, in order: load the instance, validate the "deploy" state transition (WP-4),
/// acquire the per-instance operation lock (WP-3), generate a deployment ID (WP-2),
/// flatten and validate the configuration (WP-16: template state is captured at this point),
/// persist a deployment record (Pending, then InProgress), send the
/// <c>DeployInstanceCommand</c> to the site, record the site's response and write an audit entry.
/// When the site reports success the instance is marked Enabled and the deployed
/// configuration snapshot is stored (WP-8).
/// </summary>
/// <param name="instanceId">ID of the instance to deploy.</param>
/// <param name="user">User recorded on the deployment record, the site command and the audit entry.</param>
/// <param name="cancellationToken">Token passed to the repository, lock, flattening and communication calls.</param>
/// <returns>
/// A success result carrying the deployment record when the site reports
/// <c>DeploymentStatus.Success</c>; otherwise a failure result whose message names the
/// step that failed (instance lookup, state transition, validation, site response or
/// communication failure).
/// </returns>
/// <remarks>
/// A <see cref="TimeoutException"/> or <see cref="OperationCanceledException"/> raised after
/// the record has been moved to InProgress is converted into a failure result. The Failed
/// status and the "DeployFailed" audit entry are written with <see cref="CancellationToken.None"/>
/// so that a cancelled caller token cannot leave the record stuck in InProgress.
/// </remarks>
public async Task<Result<DeploymentRecord>> DeployInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    // Load instance
    var instance = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (instance == null)
        return Result<DeploymentRecord>.Failure($"Instance with ID {instanceId} not found.");

    // WP-4: Validate state transition.
    // NOTE(review): the state is checked before the lock below is acquired and the instance
    // is not re-read afterwards; confirm whether a concurrent operation can change it in between.
    var transitionError = StateTransitionValidator.ValidateTransition(instance.State, "deploy");
    if (transitionError != null)
        return Result<DeploymentRecord>.Failure(transitionError);

    // WP-3: Acquire per-instance operation lock (released when this method exits)
    using var lockHandle = await _lockManager.AcquireAsync(
        instance.UniqueName, _options.OperationLockTimeout, cancellationToken);

    // WP-2: Generate unique deployment ID
    var deploymentId = Guid.NewGuid().ToString("N");

    // WP-1/16: Flatten configuration (captures template state at this point in time)
    var flattenResult = await _flatteningPipeline.FlattenAndValidateAsync(instanceId, cancellationToken);
    if (flattenResult.IsFailure)
        return Result<DeploymentRecord>.Failure($"Validation failed: {flattenResult.Error}");

    var flattenedConfig = flattenResult.Value.Configuration;
    var revisionHash = flattenResult.Value.RevisionHash;
    var validationResult = flattenResult.Value.Validation;

    if (!validationResult.IsValid)
    {
        var errors = string.Join("; ", validationResult.Errors.Select(e => e.Message));
        return Result<DeploymentRecord>.Failure($"Pre-deployment validation failed: {errors}");
    }

    // Serialize for transmission
    var configJson = JsonSerializer.Serialize(flattenedConfig);

    // WP-4: Create deployment record with Pending status
    var record = new DeploymentRecord(deploymentId, user)
    {
        InstanceId = instanceId,
        Status = DeploymentStatus.Pending,
        RevisionHash = revisionHash,
        DeployedAt = DateTimeOffset.UtcNow
    };

    await _repository.AddDeploymentRecordAsync(record, cancellationToken);
    await _repository.SaveChangesAsync(cancellationToken);

    // Update status to InProgress
    record.Status = DeploymentStatus.InProgress;
    await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
    await _repository.SaveChangesAsync(cancellationToken);

    // Note: this try block also wraps the post-response persistence and audit calls, so a
    // timeout or cancellation raised there is handled by the same catch as a send failure.
    try
    {
        // WP-1: Send to site via CommunicationService
        var siteId = instance.SiteId.ToString();
        var command = new DeployInstanceCommand(
            deploymentId, instance.UniqueName, revisionHash, configJson, user, DateTimeOffset.UtcNow);

        _logger.LogInformation(
            "Sending deployment {DeploymentId} for instance {Instance} to site {SiteId}",
            deploymentId, instance.UniqueName, siteId);

        var response = await _communicationService.DeployInstanceAsync(siteId, command, cancellationToken);

        // WP-1: Update status based on site response
        record.Status = response.Status;
        record.ErrorMessage = response.ErrorMessage;
        record.CompletedAt = DateTimeOffset.UtcNow;
        await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);

        if (response.Status == DeploymentStatus.Success)
        {
            // WP-4: Update instance state to Enabled on successful deployment
            instance.State = InstanceState.Enabled;
            await _repository.UpdateInstanceAsync(instance, cancellationToken);

            // WP-8: Store deployed config snapshot
            await StoreDeployedSnapshotAsync(instanceId, deploymentId, revisionHash, configJson, cancellationToken);
        }

        await _repository.SaveChangesAsync(cancellationToken);

        // Audit log
        await _auditService.LogAsync(user, "Deploy", "Instance", instanceId.ToString(),
            instance.UniqueName, new { DeploymentId = deploymentId, Status = record.Status.ToString() },
            cancellationToken);

        _logger.LogInformation(
            "Deployment {DeploymentId} for instance {Instance}: {Status}",
            deploymentId, instance.UniqueName, record.Status);

        return record.Status == DeploymentStatus.Success
            ? Result<DeploymentRecord>.Success(record)
            : Result<DeploymentRecord>.Failure(
                $"Deployment failed: {response.ErrorMessage ?? "Unknown error"}");
    }
    catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
    {
        _logger.LogWarning(ex,
            "Deployment {DeploymentId} for instance {Instance} failed with a communication failure",
            deploymentId, instance.UniqueName);

        record.Status = DeploymentStatus.Failed;
        record.ErrorMessage = $"Communication failure: {ex.Message}";
        record.CompletedAt = DateTimeOffset.UtcNow;

        // The caller's token may be the reason we are here. Reusing it would make these
        // calls throw again and leave the record in InProgress with no audit entry, so the
        // failure bookkeeping deliberately runs without cancellation.
        await _repository.UpdateDeploymentRecordAsync(record, CancellationToken.None);
        await _repository.SaveChangesAsync(CancellationToken.None);

        await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
            instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message },
            CancellationToken.None);

        return Result<DeploymentRecord>.Failure($"Deployment timed out: {ex.Message}");
    }
}
/// <summary>
/// WP-6: Disables an instance by sending a <c>DisableInstanceCommand</c> to its site.
/// The instance is marked Disabled centrally only when the site reports success.
/// An audit entry is written for both outcomes.
/// </summary>
/// <param name="instanceId">ID of the instance to disable.</param>
/// <param name="user">User recorded on the audit entry.</param>
/// <param name="cancellationToken">Token passed to the repository, lock, communication and audit calls.</param>
/// <returns>
/// Success carrying the site's response, or a failure when the instance is missing,
/// the "disable" transition is rejected, or the site reports failure.
/// </returns>
public async Task<Result<InstanceLifecycleResponse>> DisableInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    var target = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (target is null)
    {
        return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");
    }

    var rejection = StateTransitionValidator.ValidateTransition(target.State, "disable");
    if (rejection is not null)
    {
        return Result<InstanceLifecycleResponse>.Failure(rejection);
    }

    // Held until the method returns, so the site call and the state update share one lock.
    using var operationLock = await _lockManager.AcquireAsync(
        target.UniqueName, _options.OperationLockTimeout, cancellationToken);

    var commandId = Guid.NewGuid().ToString("N");
    var siteId = target.SiteId.ToString();
    var reply = await _communicationService.DisableInstanceAsync(
        siteId,
        new DisableInstanceCommand(commandId, target.UniqueName, DateTimeOffset.UtcNow),
        cancellationToken);

    if (reply.Success)
    {
        target.State = InstanceState.Disabled;
        await _repository.UpdateInstanceAsync(target, cancellationToken);
        await _repository.SaveChangesAsync(cancellationToken);
    }

    await _auditService.LogAsync(
        user,
        "Disable",
        "Instance",
        instanceId.ToString(),
        target.UniqueName,
        new { CommandId = commandId, reply.Success },
        cancellationToken);

    if (reply.Success)
    {
        return Result<InstanceLifecycleResponse>.Success(reply);
    }

    return Result<InstanceLifecycleResponse>.Failure(reply.ErrorMessage ?? "Disable failed.");
}
/// <summary>
/// WP-6: Enables an instance by sending an <c>EnableInstanceCommand</c> to its site.
/// The instance is marked Enabled centrally only when the site reports success.
/// An audit entry is written for both outcomes.
/// </summary>
/// <param name="instanceId">ID of the instance to enable.</param>
/// <param name="user">User recorded on the audit entry.</param>
/// <param name="cancellationToken">Token passed to the repository, lock, communication and audit calls.</param>
/// <returns>
/// Success carrying the site's response, or a failure when the instance is missing,
/// the "enable" transition is rejected, or the site reports failure.
/// </returns>
public async Task<Result<InstanceLifecycleResponse>> EnableInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    var target = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (target is null)
    {
        return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");
    }

    var rejection = StateTransitionValidator.ValidateTransition(target.State, "enable");
    if (rejection is not null)
    {
        return Result<InstanceLifecycleResponse>.Failure(rejection);
    }

    // Held until the method returns, so the site call and the state update share one lock.
    using var operationLock = await _lockManager.AcquireAsync(
        target.UniqueName, _options.OperationLockTimeout, cancellationToken);

    var commandId = Guid.NewGuid().ToString("N");
    var siteId = target.SiteId.ToString();
    var reply = await _communicationService.EnableInstanceAsync(
        siteId,
        new EnableInstanceCommand(commandId, target.UniqueName, DateTimeOffset.UtcNow),
        cancellationToken);

    if (reply.Success)
    {
        target.State = InstanceState.Enabled;
        await _repository.UpdateInstanceAsync(target, cancellationToken);
        await _repository.SaveChangesAsync(cancellationToken);
    }

    await _auditService.LogAsync(
        user,
        "Enable",
        "Instance",
        instanceId.ToString(),
        target.UniqueName,
        new { CommandId = commandId, reply.Success },
        cancellationToken);

    if (reply.Success)
    {
        return Result<InstanceLifecycleResponse>.Success(reply);
    }

    return Result<InstanceLifecycleResponse>.Failure(reply.ErrorMessage ?? "Enable failed.");
}
/// <summary>
/// WP-6: Deletes an instance's deployment by sending a <c>DeleteInstanceCommand</c> to its site.
/// When the site reports success, the central deployed snapshot is removed and the instance
/// is set back to NotDeployed; the instance row itself is not deleted here.
/// An audit entry is written for both outcomes.
/// </summary>
/// <param name="instanceId">ID of the instance to delete from its site.</param>
/// <param name="user">User recorded on the audit entry.</param>
/// <param name="cancellationToken">Token passed to the repository, lock, communication and audit calls.</param>
/// <returns>
/// Success carrying the site's response, or a failure when the instance is missing,
/// the "delete" transition is rejected, or the site reports failure.
/// </returns>
public async Task<Result<InstanceLifecycleResponse>> DeleteInstanceAsync(
    int instanceId,
    string user,
    CancellationToken cancellationToken = default)
{
    var target = await _repository.GetInstanceByIdAsync(instanceId, cancellationToken);
    if (target is null)
    {
        return Result<InstanceLifecycleResponse>.Failure($"Instance with ID {instanceId} not found.");
    }

    var rejection = StateTransitionValidator.ValidateTransition(target.State, "delete");
    if (rejection is not null)
    {
        return Result<InstanceLifecycleResponse>.Failure(rejection);
    }

    // Held until the method returns, so the site call and the central cleanup share one lock.
    using var operationLock = await _lockManager.AcquireAsync(
        target.UniqueName, _options.OperationLockTimeout, cancellationToken);

    var commandId = Guid.NewGuid().ToString("N");
    var siteId = target.SiteId.ToString();
    var reply = await _communicationService.DeleteInstanceAsync(
        siteId,
        new DeleteInstanceCommand(commandId, target.UniqueName, DateTimeOffset.UtcNow),
        cancellationToken);

    if (reply.Success)
    {
        // Drop the stored snapshot first, then reset the state; both are saved together.
        await _repository.DeleteDeployedSnapshotAsync(instanceId, cancellationToken);

        // Higher layers may remove the instance record entirely; here it only becomes NotDeployed.
        target.State = InstanceState.NotDeployed;
        await _repository.UpdateInstanceAsync(target, cancellationToken);
        await _repository.SaveChangesAsync(cancellationToken);
    }

    await _auditService.LogAsync(
        user,
        "Delete",
        "Instance",
        instanceId.ToString(),
        target.UniqueName,
        new { CommandId = commandId, reply.Success },
        cancellationToken);

    if (reply.Success)
    {
        return Result<InstanceLifecycleResponse>.Success(reply);
    }

    return Result<InstanceLifecycleResponse>.Failure(
        reply.ErrorMessage ?? "Delete failed. Site may be unreachable.");
}
/// <summary>
/// WP-8: Compares the revision hash of the stored deployed snapshot with the hash of a
/// freshly flattened configuration for the same instance.
/// </summary>
/// <param name="instanceId">ID of the instance whose deployment is compared.</param>
/// <param name="cancellationToken">Token passed to the repository and flattening calls.</param>
/// <returns>
/// Success carrying both hashes, the staleness flag and the snapshot's deployment time;
/// a failure when no snapshot exists or the current configuration cannot be flattened.
/// </returns>
public async Task<Result<DeploymentComparisonResult>> GetDeploymentComparisonAsync(
    int instanceId,
    CancellationToken cancellationToken = default)
{
    var deployed = await _repository.GetDeployedSnapshotByInstanceIdAsync(instanceId, cancellationToken);
    if (deployed is null)
    {
        return Result<DeploymentComparisonResult>.Failure("No deployed snapshot found for this instance.");
    }

    // Re-derive the configuration from the current templates to obtain today's hash.
    var flattened = await _flatteningPipeline.FlattenAndValidateAsync(instanceId, cancellationToken);
    if (flattened.IsFailure)
    {
        return Result<DeploymentComparisonResult>.Failure($"Cannot compute current config: {flattened.Error}");
    }

    var templateHash = flattened.Value.RevisionHash;

    // Stale means the deployed hash no longer matches the template-derived hash.
    var hashesDiffer = deployed.RevisionHash != templateHash;

    return Result<DeploymentComparisonResult>.Success(
        new DeploymentComparisonResult(
            instanceId,
            deployed.RevisionHash,
            templateHash,
            hashesDiffer,
            deployed.DeployedAt));
}
/// <summary>
/// WP-2: Looks up the deployment record stored in the repository for a deployment ID,
/// e.g. to check the recorded state after a failover or timeout before re-deploying.
/// This method reads the repository only; it does not contact the site.
/// </summary>
/// <param name="deploymentId">The deployment ID to look up.</param>
/// <param name="cancellationToken">Token passed to the repository call.</param>
/// <returns>The value returned by the repository lookup, which may be <c>null</c>.</returns>
public async Task<DeploymentRecord?> GetDeploymentStatusAsync(
    string deploymentId,
    CancellationToken cancellationToken = default)
    => await _repository.GetDeploymentByDeploymentIdAsync(deploymentId, cancellationToken);
/// <summary>
/// WP-8: Upserts the deployed configuration snapshot for an instance: adds a new snapshot
/// when none is stored, otherwise overwrites the stored one in place. Changes are only
/// staged on the repository; this method does not call SaveChangesAsync.
/// </summary>
private async Task StoreDeployedSnapshotAsync(
    int instanceId,
    string deploymentId,
    string revisionHash,
    string configJson,
    CancellationToken cancellationToken)
{
    var stored = await _repository.GetDeployedSnapshotByInstanceIdAsync(instanceId, cancellationToken);

    if (stored is null)
    {
        // First deployment for this instance: create the snapshot.
        await _repository.AddDeployedSnapshotAsync(
            new DeployedConfigSnapshot(deploymentId, revisionHash, configJson)
            {
                InstanceId = instanceId,
                DeployedAt = DateTimeOffset.UtcNow
            },
            cancellationToken);
        return;
    }

    // Re-deployment: replace the previous snapshot's contents.
    stored.DeploymentId = deploymentId;
    stored.RevisionHash = revisionHash;
    stored.ConfigurationJson = configJson;
    stored.DeployedAt = DateTimeOffset.UtcNow;
    await _repository.UpdateDeployedSnapshotAsync(stored, cancellationToken);
}
}
/// <summary>
/// WP-8: Result of comparing deployed vs template-derived configuration.
/// </summary>
/// <param name="InstanceId">ID of the instance that was compared.</param>
/// <param name="DeployedRevisionHash">Revision hash taken from the stored deployed snapshot.</param>
/// <param name="CurrentRevisionHash">Revision hash of the configuration flattened at comparison time.</param>
/// <param name="IsStale">True when the deployed and current revision hashes differ.</param>
/// <param name="DeployedAt">Deployment timestamp taken from the stored deployed snapshot.</param>
public record DeploymentComparisonResult(
int InstanceId,
string DeployedRevisionHash,
string CurrentRevisionHash,
bool IsStale,
DateTimeOffset DeployedAt);