fix(deployment-manager): resolve DeploymentManager-003..011 — atomic status commit, orphan-delete handling, semaphore reclamation, structured diff, options binding, lifecycle test coverage

This commit is contained in:
Joseph Doherty
2026-05-16 21:11:24 -04:00
parent c9b236e507
commit 8c67ffad2a
8 changed files with 760 additions and 44 deletions

View File

@@ -40,6 +40,7 @@ public class DeploymentService
private readonly CommunicationService _communicationService;
private readonly OperationLockManager _lockManager;
private readonly IAuditService _auditService;
private readonly DiffService _diffService;
private readonly DeploymentManagerOptions _options;
private readonly ILogger<DeploymentService> _logger;
@@ -58,6 +59,7 @@ public class DeploymentService
CommunicationService communicationService,
OperationLockManager lockManager,
IAuditService auditService,
DiffService diffService,
IOptions<DeploymentManagerOptions> options,
ILogger<DeploymentService> logger)
{
@@ -67,6 +69,7 @@ public class DeploymentService
_communicationService = communicationService;
_lockManager = lockManager;
_auditService = auditService;
_diffService = diffService;
_options = options.Value;
_logger = logger;
}
@@ -171,24 +174,47 @@ public class DeploymentService
var response = await _communicationService.DeployInstanceAsync(siteId, command, cancellationToken);
// WP-1: Update status based on site response
// WP-1: Update status based on site response.
record.Status = response.Status;
record.ErrorMessage = response.ErrorMessage;
record.CompletedAt = DateTimeOffset.UtcNow;
// DeploymentManager-003: once the site has confirmed the apply,
// commit the deployment record's terminal status BEFORE touching
// instance state and the deployed-config snapshot. If a later write
// (instance update / snapshot store) fails, the recorded fact that
// the site succeeded must NOT be lost -- otherwise central reports a
// non-Success record while the site is running the new config.
await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
if (response.Status == DeploymentStatus.Success)
{
// WP-4: Update instance state to Enabled on successful deployment
instance.State = InstanceState.Enabled;
await _repository.UpdateInstanceAsync(instance, cancellationToken);
// The site has applied the deployment. The post-success
// persistence below is best-effort: a failure here must be
// logged loudly for operator reconciliation but must not flip
// the already-committed Success record back to Failed.
try
{
// WP-4: Update instance state to Enabled on successful deployment
instance.State = InstanceState.Enabled;
await _repository.UpdateInstanceAsync(instance, cancellationToken);
// WP-8: Store deployed config snapshot
await StoreDeployedSnapshotAsync(instanceId, deploymentId, revisionHash, configJson, cancellationToken);
// WP-8: Store deployed config snapshot
await StoreDeployedSnapshotAsync(instanceId, deploymentId, revisionHash, configJson, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
}
catch (Exception postEx)
{
_logger.LogError(postEx,
"Deployment {DeploymentId} for instance {Instance} was applied by the site and " +
"recorded Success, but post-success persistence (instance state / config snapshot) " +
"failed -- central and site state may diverge until reconciled",
deploymentId, instance.UniqueName);
}
}
await _repository.SaveChangesAsync(cancellationToken);
// Audit log
await _auditService.LogAsync(user, "Deploy", "Instance", instanceId.ToString(),
instance.UniqueName, new { DeploymentId = deploymentId, Status = record.Status.ToString() },
@@ -368,8 +394,34 @@ public class DeploymentService
// Delete means delete: remove the instance record entirely.
// Deployment records, snapshot, overrides, and connection bindings
// are removed with it (see repository implementation).
await _repository.DeleteInstanceAsync(instanceId, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
//
// DeploymentManager-004: the site has already destroyed the Instance
// Actor and removed its config. If the central record removal now
// fails (DB error / concurrency), the exception must NOT escape
// uncaught -- that would leave the central record orphaned and
// un-deletable through the normal path (a re-issued delete may fail
// because the site no longer has the instance). Surface a distinct
// failure so an operator can reconcile.
try
{
await _repository.DeleteInstanceAsync(instanceId, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
}
catch (Exception ex)
{
_logger.LogError(ex,
"Instance {Instance} was deleted at the site, but the central record could not be " +
"removed -- the central record is now orphaned and must be reconciled manually",
instance.UniqueName);
await _auditService.LogAsync(user, "DeleteOrphaned", "Instance", instanceId.ToString(),
instance.UniqueName, new { CommandId = commandId, Error = ex.Message },
CancellationToken.None);
return Result<InstanceLifecycleResponse>.Failure(
$"The site deleted instance '{instance.UniqueName}', but the central record could not " +
$"be removed: {ex.Message}. The central record is orphaned and must be reconciled.");
}
}
await _auditService.LogAsync(user, "Delete", "Instance", instanceId.ToString(),
@@ -383,7 +435,12 @@ public class DeploymentService
}
/// <summary>
/// WP-8: Get the deployed config snapshot and compare with current template-derived state.
/// WP-8: Get the deployed config snapshot and compare with current
/// template-derived state. Produces both a staleness flag and — per the
/// design's "Diff View" — a structured <see cref="ConfigurationDiff"/> of
/// added/removed/changed attributes, alarms, and scripts (including data
/// connection binding changes) computed by the TemplateEngine
/// <see cref="DiffService"/>.
/// </summary>
public async Task<Result<DeploymentComparisonResult>> GetDeploymentComparisonAsync(
int instanceId,
@@ -398,15 +455,47 @@ public class DeploymentService
if (currentResult.IsFailure)
return Result<DeploymentComparisonResult>.Failure($"Cannot compute current config: {currentResult.Error}");
var currentConfig = currentResult.Value.Configuration;
var currentHash = currentResult.Value.RevisionHash;
var isStale = snapshot.RevisionHash != currentHash;
// DeploymentManager-007: deserialize the deployed snapshot and run the
// TemplateEngine DiffService so the result carries real
// added/removed/changed detail, not just a hash comparison. A snapshot
// that cannot be deserialized (corrupt / older schema) still yields the
// hash-based staleness result, with a null diff.
ConfigurationDiff? diff = null;
try
{
var deployedConfig = JsonSerializer.Deserialize<FlattenedConfiguration>(snapshot.ConfigurationJson);
if (deployedConfig != null)
{
diff = _diffService.ComputeDiff(
deployedConfig, currentConfig, snapshot.RevisionHash, currentHash);
}
else
{
_logger.LogWarning(
"Deployed snapshot for instance {InstanceId} deserialized to null; " +
"returning hash-based comparison without a structured diff",
instanceId);
}
}
catch (JsonException ex)
{
_logger.LogWarning(ex,
"Could not deserialize deployed snapshot for instance {InstanceId}; " +
"returning hash-based comparison without a structured diff",
instanceId);
}
var result = new DeploymentComparisonResult(
instanceId,
snapshot.RevisionHash,
currentHash,
isStale,
snapshot.DeployedAt);
snapshot.DeployedAt,
diff);
return Result<DeploymentComparisonResult>.Success(result);
}
@@ -551,9 +640,16 @@ public class DeploymentService
/// <summary>
/// WP-8: Result of comparing deployed vs template-derived configuration.
/// </summary>
/// <param name="InstanceId">Identifier of the instance the comparison was computed for.</param>
/// <param name="DeployedRevisionHash">
/// Revision hash recorded with the deployed snapshot (populated from the
/// snapshot's <c>RevisionHash</c> by <c>GetDeploymentComparisonAsync</c>).
/// </param>
/// <param name="CurrentRevisionHash">
/// Revision hash of the configuration computed at comparison time.
/// </param>
/// <param name="IsStale">
/// True when <paramref name="DeployedRevisionHash"/> and
/// <paramref name="CurrentRevisionHash"/> differ.
/// </param>
/// <param name="DeployedAt">Deployment timestamp carried over from the deployed snapshot.</param>
/// <param name="Diff">
/// DeploymentManager-007: structured added/removed/changed detail for
/// attributes, alarms, and scripts. Null when the deployed snapshot could
/// not be deserialized (corrupt / older schema) or deserialized to null, in
/// which case <see cref="IsStale"/> still reflects the hash comparison.
/// Defaults to null when the argument is omitted.
/// </param>
public record DeploymentComparisonResult(
int InstanceId,
string DeployedRevisionHash,
string CurrentRevisionHash,
bool IsStale,
DateTimeOffset DeployedAt);
DateTimeOffset DeployedAt,
ConfigurationDiff? Diff = null);

View File

@@ -6,13 +6,34 @@ namespace ScadaLink.DeploymentManager;
/// WP-3: Per-instance operation lock. Only one mutating operation (deploy, disable, enable, delete)
/// may be in progress per instance at a time. Different instances can proceed in parallel.
///
/// Implementation: ConcurrentDictionary of SemaphoreSlim(1,1) keyed by instance unique name.
/// Lock released on completion, timeout, or failure.
/// Implementation: ConcurrentDictionary of ref-counted SemaphoreSlim(1,1) keyed by instance
/// unique name. The lock is released on completion, timeout, or failure.
/// Lost on central failover (acceptable per design -- in-progress treated as failed).
///
/// DeploymentManager-005: each entry is ref-counted. The semaphore is created on the
/// first acquire/wait, shared while there are waiters or a holder, and removed +
/// <see cref="IDisposable.Dispose"/>d when the last reference is released — so the dictionary
/// does not accumulate one kernel wait handle per distinct instance name forever.
/// </summary>
public class OperationLockManager
{
private readonly ConcurrentDictionary<string, SemaphoreSlim> _locks = new(StringComparer.Ordinal);
private readonly object _gate = new();
private readonly Dictionary<string, LockEntry> _locks = new(StringComparer.Ordinal);
/// <summary>
/// Gets how many lock entries the manager is tracking at this moment.
/// Exposed for diagnostics and so that reclamation of released entries can
/// be verified (DeploymentManager-005).
/// </summary>
public int TrackedLockCount
{
    get
    {
        int tracked;

        // Read under the gate: every mutation of _locks happens while holding it.
        lock (_gate)
        {
            tracked = _locks.Count;
        }

        return tracked;
    }
}
/// <summary>
/// Acquires the operation lock for the given instance. Returns a disposable that releases the lock.
@@ -20,16 +41,40 @@ public class OperationLockManager
/// </summary>
public async Task<IDisposable> AcquireAsync(string instanceUniqueName, TimeSpan timeout, CancellationToken cancellationToken = default)
{
// NOTE(review): the statements below mix a GetOrAdd/SemaphoreSlim acquire
// (ending in `new LockRelease(semaphore)`) with a ref-counted LockEntry
// acquire (ending in `new LockRelease(this, ...)`). This looks like the
// pre- and post-change sides of a diff shown together -- confirm which
// path is live before relying on the statement order here.
var semaphore = _locks.GetOrAdd(instanceUniqueName, _ => new SemaphoreSlim(1, 1));
if (!await semaphore.WaitAsync(timeout, cancellationToken))
// Reserve a reference (creating the entry if needed) BEFORE waiting, so a
// concurrent waiter for the same instance shares the same semaphore and
// the entry survives until every waiter/holder has released it.
LockEntry entry;
lock (_gate)
{
throw new TimeoutException(
$"Could not acquire operation lock for instance '{instanceUniqueName}' within {timeout.TotalSeconds}s. " +
"Another mutating operation is in progress.");
if (!_locks.TryGetValue(instanceUniqueName, out entry!))
{
entry = new LockEntry();
_locks[instanceUniqueName] = entry;
}
// Count this caller against the entry; ReleaseReference decrements the
// count under the same gate.
entry.RefCount++;
}
return new LockRelease(semaphore);
try
{
// A false result is treated as "timeout elapsed without entering the
// semaphore" and surfaced as a TimeoutException.
if (!await entry.Semaphore.WaitAsync(timeout, cancellationToken))
{
throw new TimeoutException(
$"Could not acquire operation lock for instance '{instanceUniqueName}' within {timeout.TotalSeconds}s. " +
"Another mutating operation is in progress.");
}
}
catch (Exception) when (DropReferenceOnFailure(instanceUniqueName, entry))
{
// DropReferenceOnFailure always returns false; the filter just runs
// the cleanup so the reservation is not leaked when WaitAsync throws
// or times out (TimeoutException / OperationCanceledException). The
// exception still propagates. The semaphore was NOT entered on any
// of these paths, so only the reference is dropped.
throw;
}
// The semaphore is held at this point. Disposing the returned handle calls
// ReleaseReference with semaphoreWasEntered: true, which releases the
// semaphore and drops this caller's reference.
return new LockRelease(this, instanceUniqueName, entry);
}
/// <summary>
@@ -37,21 +82,73 @@ public class OperationLockManager
/// </summary>
public bool IsLocked(string instanceUniqueName)
{
// NOTE(review): two return statements appear here (one reading a semaphore
// directly, one reading a LockEntry under _gate) -- presumably the old and
// new sides of a diff; confirm which one is live.
return _locks.TryGetValue(instanceUniqueName, out var semaphore) && semaphore.CurrentCount == 0;
lock (_gate)
{
// "Locked" means an entry exists for this name and its semaphore has no
// free slot (CurrentCount == 0). A name with no entry reports false.
// Read under _gate because ReleaseReference removes and disposes
// entries while holding the same gate.
return _locks.TryGetValue(instanceUniqueName, out var entry) && entry.Semaphore.CurrentCount == 0;
}
}
/// <summary>
/// Cleanup hook used from the exception filter in <c>AcquireAsync</c>: gives
/// back the reference reserved for an acquire that did not enter the
/// semaphore, without releasing the semaphore itself.
/// </summary>
/// <param name="instanceUniqueName">Key of the entry whose reference is dropped.</param>
/// <param name="entry">The entry the failed acquire had reserved.</param>
/// <returns>Always <c>false</c>.</returns>
private bool DropReferenceOnFailure(string instanceUniqueName, LockEntry entry)
{
    const bool semaphoreWasEntered = false;
    ReleaseReference(instanceUniqueName, entry, semaphoreWasEntered);
    return false;
}
/// <summary>
/// Gives back one reference to <paramref name="entry"/>. If the caller had
/// entered the semaphore (<paramref name="semaphoreWasEntered"/> is true) the
/// semaphore is released first. Once no references remain, and the entry is
/// still the one registered under <paramref name="instanceUniqueName"/>, it
/// is removed from the dictionary and its semaphore is disposed.
/// </summary>
private void ReleaseReference(string instanceUniqueName, LockEntry entry, bool semaphoreWasEntered)
{
    lock (_gate)
    {
        // The semaphore release and the reference drop share one critical
        // section, so the remove/dispose decision below is made against a
        // consistent reference count.
        if (semaphoreWasEntered)
        {
            entry.Semaphore.Release();
        }

        var remaining = --entry.RefCount;
        if (remaining > 0)
        {
            // Another waiter or holder still references this entry.
            return;
        }

        // Only reclaim the entry if it is still the registered one for this name.
        if (!_locks.TryGetValue(instanceUniqueName, out var tracked) || !ReferenceEquals(tracked, entry))
        {
            return;
        }

        _locks.Remove(instanceUniqueName);
        entry.Semaphore.Dispose();
    }
}
/// <summary>
/// A tracked lock: one semaphore plus the number of acquires referencing it.
/// </summary>
private sealed class LockEntry
{
    /// <summary>In-flight acquires (waiters plus the current holder). Guarded by <see cref="_gate"/>.</summary>
    public int RefCount;

    /// <summary>Semaphore created with an initial and maximum count of 1.</summary>
    public readonly SemaphoreSlim Semaphore = new SemaphoreSlim(1, 1);
}
/// <summary>
/// Disposable handle for a held operation lock. The first
/// <see cref="Dispose"/> call gives the lock back; later calls do nothing.
/// </summary>
// NOTE(review): as shown, this class has two constructors and Dispose releases
// through both _semaphore and _owner.ReleaseReference(semaphoreWasEntered: true).
// That looks like the pre- and post-change sides of a diff shown together --
// confirm that only the owner/entry path is live.
private sealed class LockRelease : IDisposable
{
private readonly SemaphoreSlim _semaphore;
private readonly OperationLockManager _owner;
private readonly string _instanceUniqueName;
private readonly LockEntry _entry;
// 0 until Dispose has run once, then 1; flipped atomically in Dispose.
private int _disposed;
public LockRelease(SemaphoreSlim semaphore) => _semaphore = semaphore;
public LockRelease(OperationLockManager owner, string instanceUniqueName, LockEntry entry)
{
_owner = owner;
_instanceUniqueName = instanceUniqueName;
_entry = entry;
}
public void Dispose()
{
// CompareExchange lets exactly one caller through, so a repeated or
// concurrent Dispose cannot release the lock twice.
if (Interlocked.CompareExchange(ref _disposed, 1, 0) == 0)
{
_semaphore.Release();
_owner.ReleaseReference(_instanceUniqueName, _entry, semaphoreWasEntered: true);
}
}
}

View File

@@ -11,6 +11,7 @@
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Options" />
<PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" />
</ItemGroup>
<ItemGroup>

View File

@@ -1,9 +1,41 @@
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
namespace ScadaLink.DeploymentManager;
public static class ServiceCollectionExtensions
{
/// <summary>
/// Configuration section that <see cref="DeploymentManagerOptions"/> is bound to.
/// </summary>
public const string OptionsSection = "ScadaLink:DeploymentManager";
/// <summary>
/// Registers the Deployment Manager services and binds
/// <see cref="DeploymentManagerOptions"/> to the configuration section named
/// by <see cref="OptionsSection"/>, following the Options-pattern convention
/// ("Per-component configuration via appsettings.json sections bound to
/// options classes").
/// </summary>
/// <param name="services">The service collection to register into.</param>
/// <param name="configuration">Configuration root that contains the <see cref="OptionsSection"/> section.</param>
/// <returns>The result of the parameterless <c>AddDeploymentManager</c> registration.</returns>
/// <exception cref="ArgumentNullException"><paramref name="configuration"/> is null.</exception>
public static IServiceCollection AddDeploymentManager(
    this IServiceCollection services,
    IConfiguration configuration)
{
    ArgumentNullException.ThrowIfNull(configuration);

    // DeploymentManager-008: bind the options class to its configuration
    // section so its settings can be supplied from appsettings.json.
    var optionsSection = configuration.GetSection(OptionsSection);
    services.Configure<DeploymentManagerOptions>(optionsSection);

    return services.AddDeploymentManager();
}
/// <summary>
/// Registers the Deployment Manager services without binding options to
/// configuration. <see cref="DeploymentManagerOptions"/> falls back to its
/// declared defaults unless configured elsewhere. Prefer the
/// <see cref="AddDeploymentManager(IServiceCollection, IConfiguration)"/>
/// overload so the options are bound to <c>appsettings.json</c>.
/// </summary>
public static IServiceCollection AddDeploymentManager(this IServiceCollection services)
{
services.AddSingleton<OperationLockManager>();