fix(deployment-manager): resolve DeploymentManager-003..011 — atomic status commit, orphan-delete handling, semaphore reclamation, structured diff, options binding, lifecycle test coverage
This commit is contained in:
@@ -40,6 +40,7 @@ public class DeploymentService
|
||||
private readonly CommunicationService _communicationService;
|
||||
private readonly OperationLockManager _lockManager;
|
||||
private readonly IAuditService _auditService;
|
||||
private readonly DiffService _diffService;
|
||||
private readonly DeploymentManagerOptions _options;
|
||||
private readonly ILogger<DeploymentService> _logger;
|
||||
|
||||
@@ -58,6 +59,7 @@ public class DeploymentService
|
||||
CommunicationService communicationService,
|
||||
OperationLockManager lockManager,
|
||||
IAuditService auditService,
|
||||
DiffService diffService,
|
||||
IOptions<DeploymentManagerOptions> options,
|
||||
ILogger<DeploymentService> logger)
|
||||
{
|
||||
@@ -67,6 +69,7 @@ public class DeploymentService
|
||||
_communicationService = communicationService;
|
||||
_lockManager = lockManager;
|
||||
_auditService = auditService;
|
||||
_diffService = diffService;
|
||||
_options = options.Value;
|
||||
_logger = logger;
|
||||
}
|
||||
@@ -171,24 +174,47 @@ public class DeploymentService
|
||||
|
||||
var response = await _communicationService.DeployInstanceAsync(siteId, command, cancellationToken);
|
||||
|
||||
// WP-1: Update status based on site response
|
||||
// WP-1: Update status based on site response.
|
||||
record.Status = response.Status;
|
||||
record.ErrorMessage = response.ErrorMessage;
|
||||
record.CompletedAt = DateTimeOffset.UtcNow;
|
||||
|
||||
// DeploymentManager-003: once the site has confirmed the apply,
|
||||
// commit the deployment record's terminal status BEFORE touching
|
||||
// instance state and the deployed-config snapshot. If a later write
|
||||
// (instance update / snapshot store) fails, the recorded fact that
|
||||
// the site succeeded must NOT be lost -- otherwise central reports a
|
||||
// non-Success record while the site is running the new config.
|
||||
await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
|
||||
if (response.Status == DeploymentStatus.Success)
|
||||
{
|
||||
// WP-4: Update instance state to Enabled on successful deployment
|
||||
instance.State = InstanceState.Enabled;
|
||||
await _repository.UpdateInstanceAsync(instance, cancellationToken);
|
||||
// The site has applied the deployment. The post-success
|
||||
// persistence below is best-effort: a failure here must be
|
||||
// logged loudly for operator reconciliation but must not flip
|
||||
// the already-committed Success record back to Failed.
|
||||
try
|
||||
{
|
||||
// WP-4: Update instance state to Enabled on successful deployment
|
||||
instance.State = InstanceState.Enabled;
|
||||
await _repository.UpdateInstanceAsync(instance, cancellationToken);
|
||||
|
||||
// WP-8: Store deployed config snapshot
|
||||
await StoreDeployedSnapshotAsync(instanceId, deploymentId, revisionHash, configJson, cancellationToken);
|
||||
// WP-8: Store deployed config snapshot
|
||||
await StoreDeployedSnapshotAsync(instanceId, deploymentId, revisionHash, configJson, cancellationToken);
|
||||
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
}
|
||||
catch (Exception postEx)
|
||||
{
|
||||
_logger.LogError(postEx,
|
||||
"Deployment {DeploymentId} for instance {Instance} was applied by the site and " +
|
||||
"recorded Success, but post-success persistence (instance state / config snapshot) " +
|
||||
"failed -- central and site state may diverge until reconciled",
|
||||
deploymentId, instance.UniqueName);
|
||||
}
|
||||
}
|
||||
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
|
||||
// Audit log
|
||||
await _auditService.LogAsync(user, "Deploy", "Instance", instanceId.ToString(),
|
||||
instance.UniqueName, new { DeploymentId = deploymentId, Status = record.Status.ToString() },
|
||||
@@ -368,8 +394,34 @@ public class DeploymentService
|
||||
// Delete means delete: remove the instance record entirely.
|
||||
// Deployment records, snapshot, overrides, and connection bindings
|
||||
// are removed with it (see repository implementation).
|
||||
await _repository.DeleteInstanceAsync(instanceId, cancellationToken);
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
//
|
||||
// DeploymentManager-004: the site has already destroyed the Instance
|
||||
// Actor and removed its config. If the central record removal now
|
||||
// fails (DB error / concurrency), the exception must NOT escape
|
||||
// uncaught -- that would leave the central record orphaned and
|
||||
// un-deletable through the normal path (a re-issued delete may fail
|
||||
// because the site no longer has the instance). Surface a distinct
|
||||
// failure so an operator can reconcile.
|
||||
try
|
||||
{
|
||||
await _repository.DeleteInstanceAsync(instanceId, cancellationToken);
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex,
|
||||
"Instance {Instance} was deleted at the site, but the central record could not be " +
|
||||
"removed -- the central record is now orphaned and must be reconciled manually",
|
||||
instance.UniqueName);
|
||||
|
||||
await _auditService.LogAsync(user, "DeleteOrphaned", "Instance", instanceId.ToString(),
|
||||
instance.UniqueName, new { CommandId = commandId, Error = ex.Message },
|
||||
CancellationToken.None);
|
||||
|
||||
return Result<InstanceLifecycleResponse>.Failure(
|
||||
$"The site deleted instance '{instance.UniqueName}', but the central record could not " +
|
||||
$"be removed: {ex.Message}. The central record is orphaned and must be reconciled.");
|
||||
}
|
||||
}
|
||||
|
||||
await _auditService.LogAsync(user, "Delete", "Instance", instanceId.ToString(),
|
||||
@@ -383,7 +435,12 @@ public class DeploymentService
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// WP-8: Get the deployed config snapshot and compare with current template-derived state.
|
||||
/// WP-8: Get the deployed config snapshot and compare with current
|
||||
/// template-derived state. Produces both a staleness flag and — per the
|
||||
/// design's "Diff View" — a structured <see cref="ConfigurationDiff"/> of
|
||||
/// added/removed/changed attributes, alarms, and scripts (including data
|
||||
/// connection binding changes) computed by the TemplateEngine
|
||||
/// <see cref="DiffService"/>.
|
||||
/// </summary>
|
||||
public async Task<Result<DeploymentComparisonResult>> GetDeploymentComparisonAsync(
|
||||
int instanceId,
|
||||
@@ -398,15 +455,47 @@ public class DeploymentService
|
||||
if (currentResult.IsFailure)
|
||||
return Result<DeploymentComparisonResult>.Failure($"Cannot compute current config: {currentResult.Error}");
|
||||
|
||||
var currentConfig = currentResult.Value.Configuration;
|
||||
var currentHash = currentResult.Value.RevisionHash;
|
||||
var isStale = snapshot.RevisionHash != currentHash;
|
||||
|
||||
// DeploymentManager-007: deserialize the deployed snapshot and run the
|
||||
// TemplateEngine DiffService so the result carries real
|
||||
// added/removed/changed detail, not just a hash comparison. A snapshot
|
||||
// that cannot be deserialized (corrupt / older schema) still yields the
|
||||
// hash-based staleness result, with a null diff.
|
||||
ConfigurationDiff? diff = null;
|
||||
try
|
||||
{
|
||||
var deployedConfig = JsonSerializer.Deserialize<FlattenedConfiguration>(snapshot.ConfigurationJson);
|
||||
if (deployedConfig != null)
|
||||
{
|
||||
diff = _diffService.ComputeDiff(
|
||||
deployedConfig, currentConfig, snapshot.RevisionHash, currentHash);
|
||||
}
|
||||
else
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Deployed snapshot for instance {InstanceId} deserialized to null; " +
|
||||
"returning hash-based comparison without a structured diff",
|
||||
instanceId);
|
||||
}
|
||||
}
|
||||
catch (JsonException ex)
|
||||
{
|
||||
_logger.LogWarning(ex,
|
||||
"Could not deserialize deployed snapshot for instance {InstanceId}; " +
|
||||
"returning hash-based comparison without a structured diff",
|
||||
instanceId);
|
||||
}
|
||||
|
||||
var result = new DeploymentComparisonResult(
|
||||
instanceId,
|
||||
snapshot.RevisionHash,
|
||||
currentHash,
|
||||
isStale,
|
||||
snapshot.DeployedAt);
|
||||
snapshot.DeployedAt,
|
||||
diff);
|
||||
|
||||
return Result<DeploymentComparisonResult>.Success(result);
|
||||
}
|
||||
@@ -551,9 +640,16 @@ public class DeploymentService
|
||||
/// <summary>
/// WP-8: Result of comparing deployed vs template-derived configuration.
/// </summary>
/// <param name="InstanceId">Identifier of the instance that was compared.</param>
/// <param name="DeployedRevisionHash">Revision hash recorded on the deployed snapshot.</param>
/// <param name="CurrentRevisionHash">Revision hash of the current template-derived configuration.</param>
/// <param name="IsStale">
/// True when <paramref name="DeployedRevisionHash"/> differs from
/// <paramref name="CurrentRevisionHash"/>.
/// </param>
/// <param name="DeployedAt">Timestamp taken from the deployed snapshot.</param>
/// <param name="Diff">
/// DeploymentManager-007: structured added/removed/changed detail for
/// attributes, alarms, and scripts. Null when no structured diff is available:
/// the deployed snapshot could not be deserialized (corrupt / older schema),
/// it deserialized to null, or the caller did not supply one. In those cases
/// <see cref="IsStale"/> still reflects the hash comparison.
/// </param>
public record DeploymentComparisonResult(
    int InstanceId,
    string DeployedRevisionHash,
    string CurrentRevisionHash,
    bool IsStale,
    DateTimeOffset DeployedAt,
    ConfigurationDiff? Diff = null);
|
||||
|
||||
@@ -6,13 +6,34 @@ namespace ScadaLink.DeploymentManager;
|
||||
/// WP-3: Per-instance operation lock. Only one mutating operation (deploy, disable, enable, delete)
|
||||
/// may be in progress per instance at a time. Different instances can proceed in parallel.
|
||||
///
|
||||
/// Implementation: ConcurrentDictionary of SemaphoreSlim(1,1) keyed by instance unique name.
|
||||
/// Lock released on completion, timeout, or failure.
|
||||
/// Implementation: a lock-guarded Dictionary of ref-counted SemaphoreSlim(1,1) entries keyed by
|
||||
/// instance unique name. The lock is released on completion, timeout, or failure.
|
||||
/// Lost on central failover (acceptable per design -- in-progress treated as failed).
|
||||
///
|
||||
/// DeploymentManager-005: each entry is ref-counted. The semaphore is created on the
|
||||
/// first acquire/wait, shared while there are waiters or a holder, and removed +
|
||||
/// <see cref="IDisposable.Dispose"/>d when the last reference is released — so the dictionary
|
||||
/// does not accumulate one kernel wait handle per distinct instance name forever.
|
||||
/// </summary>
|
||||
public class OperationLockManager
|
||||
{
|
||||
private readonly ConcurrentDictionary<string, SemaphoreSlim> _locks = new(StringComparer.Ordinal);
|
||||
private readonly object _gate = new();
|
||||
private readonly Dictionary<string, LockEntry> _locks = new(StringComparer.Ordinal);
|
||||
|
||||
/// <summary>
|
||||
/// Number of lock entries currently tracked. Used for diagnostics and to
|
||||
/// verify that semaphores are reclaimed (DeploymentManager-005).
|
||||
/// </summary>
|
||||
public int TrackedLockCount
|
||||
{
|
||||
get
|
||||
{
|
||||
lock (_gate)
|
||||
{
|
||||
return _locks.Count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Acquires the operation lock for the given instance. Returns a disposable that releases the lock.
|
||||
@@ -20,16 +41,40 @@ public class OperationLockManager
|
||||
/// </summary>
|
||||
public async Task<IDisposable> AcquireAsync(string instanceUniqueName, TimeSpan timeout, CancellationToken cancellationToken = default)
|
||||
{
|
||||
var semaphore = _locks.GetOrAdd(instanceUniqueName, _ => new SemaphoreSlim(1, 1));
|
||||
|
||||
if (!await semaphore.WaitAsync(timeout, cancellationToken))
|
||||
// Reserve a reference (creating the entry if needed) BEFORE waiting, so a
|
||||
// concurrent waiter for the same instance shares the same semaphore and
|
||||
// the entry survives until every waiter/holder has released it.
|
||||
LockEntry entry;
|
||||
lock (_gate)
|
||||
{
|
||||
throw new TimeoutException(
|
||||
$"Could not acquire operation lock for instance '{instanceUniqueName}' within {timeout.TotalSeconds}s. " +
|
||||
"Another mutating operation is in progress.");
|
||||
if (!_locks.TryGetValue(instanceUniqueName, out entry!))
|
||||
{
|
||||
entry = new LockEntry();
|
||||
_locks[instanceUniqueName] = entry;
|
||||
}
|
||||
entry.RefCount++;
|
||||
}
|
||||
|
||||
return new LockRelease(semaphore);
|
||||
try
|
||||
{
|
||||
if (!await entry.Semaphore.WaitAsync(timeout, cancellationToken))
|
||||
{
|
||||
throw new TimeoutException(
|
||||
$"Could not acquire operation lock for instance '{instanceUniqueName}' within {timeout.TotalSeconds}s. " +
|
||||
"Another mutating operation is in progress.");
|
||||
}
|
||||
}
|
||||
catch (Exception) when (DropReferenceOnFailure(instanceUniqueName, entry))
|
||||
{
|
||||
// DropReferenceOnFailure always returns false; the filter just runs
|
||||
// the cleanup so the reservation is not leaked when WaitAsync throws
|
||||
// or times out (TimeoutException / OperationCanceledException). The
|
||||
// exception still propagates. The semaphore was NOT entered on any
|
||||
// of these paths, so only the reference is dropped.
|
||||
throw;
|
||||
}
|
||||
|
||||
return new LockRelease(this, instanceUniqueName, entry);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -37,21 +82,73 @@ public class OperationLockManager
|
||||
/// </summary>
|
||||
public bool IsLocked(string instanceUniqueName)
|
||||
{
|
||||
return _locks.TryGetValue(instanceUniqueName, out var semaphore) && semaphore.CurrentCount == 0;
|
||||
lock (_gate)
|
||||
{
|
||||
return _locks.TryGetValue(instanceUniqueName, out var entry) && entry.Semaphore.CurrentCount == 0;
|
||||
}
|
||||
}
|
||||
|
||||
private bool DropReferenceOnFailure(string instanceUniqueName, LockEntry entry)
|
||||
{
|
||||
ReleaseReference(instanceUniqueName, entry, semaphoreWasEntered: false);
|
||||
return false;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Drops one reference to the entry. When <paramref name="semaphoreWasEntered"/>
|
||||
/// is true the semaphore is released first. When the reference count reaches
|
||||
/// zero the entry is removed from the dictionary and the semaphore disposed.
|
||||
/// </summary>
|
||||
private void ReleaseReference(string instanceUniqueName, LockEntry entry, bool semaphoreWasEntered)
|
||||
{
|
||||
lock (_gate)
|
||||
{
|
||||
// Release the semaphore (handing the lock to any waiter) and drop the
|
||||
// reference under the same gate, so the dispose decision below cannot
|
||||
// race with the Release on an entry that another caller is reclaiming.
|
||||
if (semaphoreWasEntered)
|
||||
{
|
||||
entry.Semaphore.Release();
|
||||
}
|
||||
|
||||
entry.RefCount--;
|
||||
if (entry.RefCount <= 0 &&
|
||||
_locks.TryGetValue(instanceUniqueName, out var current) &&
|
||||
ReferenceEquals(current, entry))
|
||||
{
|
||||
_locks.Remove(instanceUniqueName);
|
||||
entry.Semaphore.Dispose();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private sealed class LockEntry
|
||||
{
|
||||
public readonly SemaphoreSlim Semaphore = new(1, 1);
|
||||
|
||||
/// <summary>Number of in-flight acquires (waiters + the current holder). Guarded by <see cref="_gate"/>.</summary>
|
||||
public int RefCount;
|
||||
}
|
||||
|
||||
private sealed class LockRelease : IDisposable
|
||||
{
|
||||
private readonly SemaphoreSlim _semaphore;
|
||||
private readonly OperationLockManager _owner;
|
||||
private readonly string _instanceUniqueName;
|
||||
private readonly LockEntry _entry;
|
||||
private int _disposed;
|
||||
|
||||
public LockRelease(SemaphoreSlim semaphore) => _semaphore = semaphore;
|
||||
public LockRelease(OperationLockManager owner, string instanceUniqueName, LockEntry entry)
|
||||
{
|
||||
_owner = owner;
|
||||
_instanceUniqueName = instanceUniqueName;
|
||||
_entry = entry;
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
if (Interlocked.CompareExchange(ref _disposed, 1, 0) == 0)
|
||||
{
|
||||
_semaphore.Release();
|
||||
_owner.ReleaseReference(_instanceUniqueName, _entry, semaphoreWasEntered: true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
||||
@@ -1,9 +1,41 @@
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
|
||||
namespace ScadaLink.DeploymentManager;
|
||||
|
||||
public static class ServiceCollectionExtensions
|
||||
{
|
||||
/// <summary>
|
||||
/// Configuration section that <see cref="DeploymentManagerOptions"/> is bound to.
|
||||
/// </summary>
|
||||
public const string OptionsSection = "ScadaLink:DeploymentManager";
|
||||
|
||||
/// <summary>
|
||||
/// Registers the Deployment Manager services and binds
|
||||
/// <see cref="DeploymentManagerOptions"/> to the
|
||||
/// <see cref="OptionsSection"/> configuration section, consistent with the
|
||||
/// Options-pattern convention ("Per-component configuration via
|
||||
/// appsettings.json sections bound to options classes").
|
||||
/// </summary>
|
||||
public static IServiceCollection AddDeploymentManager(
|
||||
this IServiceCollection services,
|
||||
IConfiguration configuration)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(configuration);
|
||||
|
||||
// DeploymentManager-008: bind the options class so the operation-lock
|
||||
// and artifact-deployment timeouts are tunable via appsettings.json.
|
||||
services.Configure<DeploymentManagerOptions>(configuration.GetSection(OptionsSection));
|
||||
return services.AddDeploymentManager();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Registers the Deployment Manager services without binding options to
|
||||
/// configuration. <see cref="DeploymentManagerOptions"/> falls back to its
|
||||
/// declared defaults unless configured elsewhere. Prefer the
|
||||
/// <see cref="AddDeploymentManager(IServiceCollection, IConfiguration)"/>
|
||||
/// overload so the options are bound to <c>appsettings.json</c>.
|
||||
/// </summary>
|
||||
public static IServiceCollection AddDeploymentManager(this IServiceCollection services)
|
||||
{
|
||||
services.AddSingleton<OperationLockManager>();
|
||||
|
||||
Reference in New Issue
Block a user