From 8c67ffad2a7bcedd1cd54376522b3f710c728760 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sat, 16 May 2026 21:11:24 -0400 Subject: [PATCH] =?UTF-8?q?fix(deployment-manager):=20resolve=20Deployment?= =?UTF-8?q?Manager-003..011=20=E2=80=94=20atomic=20status=20commit,=20orph?= =?UTF-8?q?an-delete=20handling,=20semaphore=20reclamation,=20structured?= =?UTF-8?q?=20diff,=20options=20binding,=20lifecycle=20test=20coverage?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- code-reviews/DeploymentManager/findings.md | 110 +++++- .../DeploymentService.cs | 122 ++++++- .../OperationLockManager.cs | 125 ++++++- .../ScadaLink.DeploymentManager.csproj | 1 + .../ServiceCollectionExtensions.cs | 32 ++ .../DeploymentServiceTests.cs | 317 +++++++++++++++++- .../OperationLockManagerTests.cs | 48 +++ .../ServiceCollectionExtensionsTests.cs | 49 +++ 8 files changed, 760 insertions(+), 44 deletions(-) create mode 100644 tests/ScadaLink.DeploymentManager.Tests/ServiceCollectionExtensionsTests.cs diff --git a/code-reviews/DeploymentManager/findings.md b/code-reviews/DeploymentManager/findings.md index 00d2d0d..57a88d0 100644 --- a/code-reviews/DeploymentManager/findings.md +++ b/code-reviews/DeploymentManager/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-16 | | Reviewer | claude-agent | | Commit reviewed | `9c60592` | -| Open findings | 11 | +| Open findings | 5 | ## Summary @@ -134,7 +134,7 @@ error) if persistence still fails. Regression test: |--|--| | Severity | Medium | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:155-170` | **Description** @@ -150,6 +150,11 @@ deployment. Central and site are now divergent: the site is running the new config but central still shows the old state and a non-`Success` deployment record. +**Verification:** Confirmed against source. The DeploymentManager-001 fix made +this strictly worse, not better — after that fix a snapshot-store failure is +caught and the record is flipped from `Success` back to `Failed`, so central +reports a *failed* deployment while the site is running the new config. + **Recommendation** Wrap the post-success persistence so that, at minimum, the deployment record's @@ -160,7 +165,15 @@ apply. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit pending): `DeployInstanceAsync` now commits the +deployment record's terminal status (`UpdateDeploymentRecordAsync` + +`SaveChangesAsync`) immediately after the site confirms the apply, *before* +touching instance state or the deployed-config snapshot. The post-success +instance-state update and `StoreDeployedSnapshotAsync` are wrapped in a +best-effort `try`/`catch` that logs loudly for operator reconciliation but no +longer flips the already-committed `Success` record back to `Failed`. +Regression test: +`DeployInstanceAsync_SiteSucceeds_SnapshotWriteFails_RecordStillCommittedSuccess`. ### DeploymentManager-004 — Site-success but central-delete-failure leaves orphaned site config @@ -168,7 +181,7 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:312-319` | **Description** @@ -182,6 +195,10 @@ normal path (the site no longer has it, so a re-issued delete may fail) and is permanently orphaned. The design states central must not mark the instance deleted until the site confirms — but it does not address the inverse failure. +**Verification:** Confirmed against source. `DeleteInstanceAsync` has no +`try`/`catch` around the post-success block, so any exception from +`DeleteInstanceAsync`/`SaveChangesAsync` escapes uncaught to the caller. + **Recommendation** Catch persistence failures in the post-success block and surface a distinct @@ -191,7 +208,13 @@ idempotent and retryable independently of the site command. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit pending): the post-success removal in +`DeleteInstanceAsync` (`DeleteInstanceAsync` + `SaveChangesAsync`) is now +wrapped in a `try`/`catch`. A persistence failure no longer escapes uncaught — +it is logged, recorded with a `DeleteOrphaned` audit entry, and surfaced as a +distinct `Result` failure stating the site deleted the instance but the central +record is orphaned and must be reconciled. Regression test: +`DeleteInstanceAsync_SiteSucceeds_CentralDeleteFails_ReturnsDistinctFailure`. ### DeploymentManager-005 — `OperationLockManager` leaks a `SemaphoreSlim` per instance name @@ -199,7 +222,7 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Performance & resource management | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.DeploymentManager/OperationLockManager.cs:15-33` | **Description** @@ -213,6 +236,9 @@ with the bulk "deploy all out-of-date instances" workflow and instances that are created and deleted over time — this is an unbounded leak of both managed memory and OS handles. Deleted instances' semaphores are never reclaimed. +**Verification:** Confirmed against source. `_locks` is a `ConcurrentDictionary` +with no removal path anywhere in the type. + **Recommendation** Either accept the leak explicitly and document the expected bounded cardinality @@ -223,7 +249,17 @@ At minimum, remove the semaphore entry when an instance is deleted **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit pending): `OperationLockManager` now ref-counts each +lock entry. A reference is reserved (creating the entry if needed) before the +`SemaphoreSlim.WaitAsync`, so concurrent waiters for the same instance share one +semaphore and the entry survives until every waiter/holder has released. When +the reference count reaches zero — on release, timeout, or cancellation — the +entry is removed from the dictionary and the semaphore is `Dispose()`d, so the +process no longer accumulates one kernel wait handle per distinct instance name. +A `TrackedLockCount` diagnostic property was added to make reclamation testable. +Regression tests: `AcquireAsync_ReleasedLock_RemovesSemaphoreEntry`, +`AcquireAsync_ManyDistinctInstances_DoesNotAccumulateSemaphores`, +`AcquireAsync_ContendedLock_KeepsSemaphoreUntilLastReleaseThenReclaims`. ### DeploymentManager-006 — Query-the-site-before-redeploy idempotency requirement not implemented @@ -294,7 +330,7 @@ stale-rejection) when the query fails. Regression tests: |--|--| | Severity | Medium | | Category | Design-document adherence | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:334-358,401-406` | **Description** @@ -308,6 +344,12 @@ added/removed/changed detail is produced, and the Template Engine's diff capability is not invoked. The UI cannot render a meaningful diff from this result. +**Verification:** Confirmed against source. The Template Engine already provides +`DiffService` + `ConfigurationDiff` (structured Added/Removed/Changed entries +for attributes, alarms, and scripts, including data connection binding fields), +and `DiffService` is DI-registered — it was simply never wired into the +Deployment Manager's comparison path. + **Recommendation** Either implement a real diff (deserialize the stored @@ -318,7 +360,15 @@ down to staleness detection only. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit pending): `GetDeploymentComparisonAsync` now +deserializes the stored `DeployedConfigSnapshot.ConfigurationJson` and runs the +Template Engine `DiffService` against the freshly flattened current +configuration, attaching the resulting `ConfigurationDiff` (added/removed/changed +attributes, alarms, scripts) to a new optional `Diff` property on +`DeploymentComparisonResult`. `DiffService` is injected into `DeploymentService`. +A snapshot that cannot be deserialized (corrupt / older schema) still yields the +hash-based staleness result with a null diff, logged at warning level. +Regression test: `GetDeploymentComparisonAsync_ProducesStructuredDiff`. ### DeploymentManager-008 — `DeploymentManagerOptions` is never bound to configuration @@ -326,7 +376,7 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Code organization & conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.DeploymentManager/ServiceCollectionExtensions.cs:7-14` | **Description** @@ -341,6 +391,9 @@ to options classes (Options pattern)." `Host/Program.cs` binds `SecurityOptions` and `InboundApiOptions` from configuration sections but has no equivalent for `DeploymentManagerOptions`. +**Verification:** Confirmed against source. Neither `AddDeploymentManager` nor +`Host/Program.cs` binds `DeploymentManagerOptions`. + **Recommendation** Add an `IConfiguration` parameter (or a configure callback) to @@ -349,7 +402,18 @@ Add an `IConfiguration` parameter (or a configure callback) to **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit pending): added an +`AddDeploymentManager(IServiceCollection, IConfiguration)` overload that binds +`DeploymentManagerOptions` to the `ScadaLink:DeploymentManager` configuration +section (exposed as `ServiceCollectionExtensions.OptionsSection`). The +`Microsoft.Extensions.Options.ConfigurationExtensions` package was added to the +project. The original parameterless overload is retained for callers/tests that +do not bind configuration. Regression tests: +`AddDeploymentManager_WithConfiguration_BindsDeploymentManagerOptions`, +`AddDeploymentManager_WithConfiguration_MissingSection_UsesDefaults`. Note: a +one-line follow-up in `Host/Program.cs` (call the new overload with +`builder.Configuration`) is required to take effect at runtime — that file is +outside this module's edit scope and is surfaced for the Host owner. ### DeploymentManager-009 — Misleading timeout comment on `DeleteInstanceAsync` @@ -415,7 +479,7 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Testing coverage | -| Status | Open | +| Status | Resolved | | Location | `tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs:100-151,155-199` | **Description** @@ -432,6 +496,13 @@ critical post-response branch (`DeploymentService.cs:154-184`) and the entire delete/disable/enable success path are untested. The `AuditLogs` test (lines 277-289) asserts nothing. +**Verification:** Partially confirmed. By the time this finding was being +resolved, the DeploymentManager-006 fix had already introduced a TestKit-actor +seam (`CreateServiceWithCommActor` + `ReconcileProbeActor`) and successful-deploy +tests. The genuinely-still-missing coverage was: successful Disable/Enable/Delete +paths, per-instance lock serialization during deploy, and the assertionless +`AuditLogs` test — those gaps were addressed. + **Recommendation** Introduce a seam to inject a fake/substitute communication path (e.g. an @@ -442,7 +513,20 @@ test assert on `IAuditService.LogAsync`. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit pending): extended the TestKit-actor seam +(`ReconcileProbeActor` now also answers lifecycle commands) and added the +missing coverage — successful Disable/Enable/Delete (state transition + audit +assertions), a successful-deploy audit assertion, and per-instance lock +serialization via a new deferred-reply `SerializationProbeActor` that asserts a +single instance's concurrent deploys never overlap. The assertionless `AuditLogs` +test was replaced with `DeployInstanceAsync_FlatteningFails_DoesNotReachAudit`, +which asserts on `IAuditService.LogAsync`. Regression tests: +`DisableInstanceAsync_SiteSucceeds_SetsDisabledStateAndAudits`, +`EnableInstanceAsync_SiteSucceeds_SetsEnabledStateAndAudits`, +`DeleteInstanceAsync_SiteSucceeds_RemovesRecordAndAudits`, +`DeployInstanceAsync_SiteSucceeds_WritesDeployAuditEntry`, +`DeployInstanceAsync_FlatteningFails_DoesNotReachAudit`, +`DeployInstanceAsync_SameInstance_OperationLockSerializesConcurrentDeploys`. ### DeploymentManager-012 — `LifecycleCommandTimeout` option is dead code diff --git a/src/ScadaLink.DeploymentManager/DeploymentService.cs b/src/ScadaLink.DeploymentManager/DeploymentService.cs index 9b8f031..6f3ad50 100644 --- a/src/ScadaLink.DeploymentManager/DeploymentService.cs +++ b/src/ScadaLink.DeploymentManager/DeploymentService.cs @@ -40,6 +40,7 @@ public class DeploymentService private readonly CommunicationService _communicationService; private readonly OperationLockManager _lockManager; private readonly IAuditService _auditService; + private readonly DiffService _diffService; private readonly DeploymentManagerOptions _options; private readonly ILogger _logger; @@ -58,6 +59,7 @@ public class DeploymentService CommunicationService communicationService, OperationLockManager lockManager, IAuditService auditService, + DiffService diffService, IOptions options, ILogger logger) { @@ -67,6 +69,7 @@ public class DeploymentService _communicationService = communicationService; _lockManager = lockManager; _auditService = auditService; + _diffService = diffService; _options = options.Value; _logger = logger; } @@ -171,24 +174,47 @@ public class DeploymentService var response = await _communicationService.DeployInstanceAsync(siteId, command, cancellationToken); - // WP-1: Update status based on site response + // WP-1: Update status based on site response. record.Status = response.Status; record.ErrorMessage = response.ErrorMessage; record.CompletedAt = DateTimeOffset.UtcNow; + + // DeploymentManager-003: once the site has confirmed the apply, + // commit the deployment record's terminal status BEFORE touching + // instance state and the deployed-config snapshot. If a later write + // (instance update / snapshot store) fails, the recorded fact that + // the site succeeded must NOT be lost -- otherwise central reports a + // non-Success record while the site is running the new config. await _repository.UpdateDeploymentRecordAsync(record, cancellationToken); + await _repository.SaveChangesAsync(cancellationToken); if (response.Status == DeploymentStatus.Success) { - // WP-4: Update instance state to Enabled on successful deployment - instance.State = InstanceState.Enabled; - await _repository.UpdateInstanceAsync(instance, cancellationToken); + // The site has applied the deployment. The post-success + // persistence below is best-effort: a failure here must be + // logged loudly for operator reconciliation but must not flip + // the already-committed Success record back to Failed. + try + { + // WP-4: Update instance state to Enabled on successful deployment + instance.State = InstanceState.Enabled; + await _repository.UpdateInstanceAsync(instance, cancellationToken); - // WP-8: Store deployed config snapshot - await StoreDeployedSnapshotAsync(instanceId, deploymentId, revisionHash, configJson, cancellationToken); + // WP-8: Store deployed config snapshot + await StoreDeployedSnapshotAsync(instanceId, deploymentId, revisionHash, configJson, cancellationToken); + + await _repository.SaveChangesAsync(cancellationToken); + } + catch (Exception postEx) + { + _logger.LogError(postEx, + "Deployment {DeploymentId} for instance {Instance} was applied by the site and " + + "recorded Success, but post-success persistence (instance state / config snapshot) " + + "failed -- central and site state may diverge until reconciled", + deploymentId, instance.UniqueName); + } } - await _repository.SaveChangesAsync(cancellationToken); - // Audit log await _auditService.LogAsync(user, "Deploy", "Instance", instanceId.ToString(), instance.UniqueName, new { DeploymentId = deploymentId, Status = record.Status.ToString() }, @@ -368,8 +394,34 @@ public class DeploymentService // Delete means delete: remove the instance record entirely. // Deployment records, snapshot, overrides, and connection bindings // are removed with it (see repository implementation). - await _repository.DeleteInstanceAsync(instanceId, cancellationToken); - await _repository.SaveChangesAsync(cancellationToken); + // + // DeploymentManager-004: the site has already destroyed the Instance + // Actor and removed its config. If the central record removal now + // fails (DB error / concurrency), the exception must NOT escape + // uncaught -- that would leave the central record orphaned and + // un-deletable through the normal path (a re-issued delete may fail + // because the site no longer has the instance). Surface a distinct + // failure so an operator can reconcile. + try + { + await _repository.DeleteInstanceAsync(instanceId, cancellationToken); + await _repository.SaveChangesAsync(cancellationToken); + } + catch (Exception ex) + { + _logger.LogError(ex, + "Instance {Instance} was deleted at the site, but the central record could not be " + + "removed -- the central record is now orphaned and must be reconciled manually", + instance.UniqueName); + + await _auditService.LogAsync(user, "DeleteOrphaned", "Instance", instanceId.ToString(), + instance.UniqueName, new { CommandId = commandId, Error = ex.Message }, + CancellationToken.None); + + return Result.Failure( + $"The site deleted instance '{instance.UniqueName}', but the central record could not " + + $"be removed: {ex.Message}. The central record is orphaned and must be reconciled."); + } } await _auditService.LogAsync(user, "Delete", "Instance", instanceId.ToString(), @@ -383,7 +435,12 @@ public class DeploymentService } /// - /// WP-8: Get the deployed config snapshot and compare with current template-derived state. + /// WP-8: Get the deployed config snapshot and compare with current + /// template-derived state. Produces both a staleness flag and — per the + /// design's "Diff View" — a structured of + /// added/removed/changed attributes, alarms, and scripts (including data + /// connection binding changes) computed by the TemplateEngine + /// . /// public async Task> GetDeploymentComparisonAsync( int instanceId, @@ -398,15 +455,47 @@ public class DeploymentService if (currentResult.IsFailure) return Result.Failure($"Cannot compute current config: {currentResult.Error}"); + var currentConfig = currentResult.Value.Configuration; var currentHash = currentResult.Value.RevisionHash; var isStale = snapshot.RevisionHash != currentHash; + // DeploymentManager-007: deserialize the deployed snapshot and run the + // TemplateEngine DiffService so the result carries real + // added/removed/changed detail, not just a hash comparison. A snapshot + // that cannot be deserialized (corrupt / older schema) still yields the + // hash-based staleness result, with a null diff. + ConfigurationDiff? diff = null; + try + { + var deployedConfig = JsonSerializer.Deserialize(snapshot.ConfigurationJson); + if (deployedConfig != null) + { + diff = _diffService.ComputeDiff( + deployedConfig, currentConfig, snapshot.RevisionHash, currentHash); + } + else + { + _logger.LogWarning( + "Deployed snapshot for instance {InstanceId} deserialized to null; " + + "returning hash-based comparison without a structured diff", + instanceId); + } + } + catch (JsonException ex) + { + _logger.LogWarning(ex, + "Could not deserialize deployed snapshot for instance {InstanceId}; " + + "returning hash-based comparison without a structured diff", + instanceId); + } + var result = new DeploymentComparisonResult( instanceId, snapshot.RevisionHash, currentHash, isStale, - snapshot.DeployedAt); + snapshot.DeployedAt, + diff); return Result.Success(result); } @@ -551,9 +640,16 @@ public class DeploymentService /// /// WP-8: Result of comparing deployed vs template-derived configuration. /// +/// +/// DeploymentManager-007: structured added/removed/changed detail for +/// attributes, alarms, and scripts. Null only when the deployed snapshot could +/// not be deserialized (corrupt / older schema), in which case +/// still reflects the hash comparison. +/// public record DeploymentComparisonResult( int InstanceId, string DeployedRevisionHash, string CurrentRevisionHash, bool IsStale, - DateTimeOffset DeployedAt); + DateTimeOffset DeployedAt, + ConfigurationDiff? Diff = null); diff --git a/src/ScadaLink.DeploymentManager/OperationLockManager.cs b/src/ScadaLink.DeploymentManager/OperationLockManager.cs index 5ffef52..8643afc 100644 --- a/src/ScadaLink.DeploymentManager/OperationLockManager.cs +++ b/src/ScadaLink.DeploymentManager/OperationLockManager.cs @@ -6,13 +6,34 @@ namespace ScadaLink.DeploymentManager; /// WP-3: Per-instance operation lock. Only one mutating operation (deploy, disable, enable, delete) /// may be in progress per instance at a time. Different instances can proceed in parallel. /// -/// Implementation: ConcurrentDictionary of SemaphoreSlim(1,1) keyed by instance unique name. -/// Lock released on completion, timeout, or failure. +/// Implementation: ConcurrentDictionary of ref-counted SemaphoreSlim(1,1) keyed by instance +/// unique name. The lock is released on completion, timeout, or failure. /// Lost on central failover (acceptable per design -- in-progress treated as failed). +/// +/// DeploymentManager-005: each entry is ref-counted. The semaphore is created on the +/// first acquire/wait, shared while there are waiters or a holder, and removed + +/// d when the last reference is released — so the dictionary +/// does not accumulate one kernel wait handle per distinct instance name forever. /// public class OperationLockManager { - private readonly ConcurrentDictionary _locks = new(StringComparer.Ordinal); + private readonly object _gate = new(); + private readonly Dictionary _locks = new(StringComparer.Ordinal); + + /// + /// Number of lock entries currently tracked. Used for diagnostics and to + /// verify that semaphores are reclaimed (DeploymentManager-005). + /// + public int TrackedLockCount + { + get + { + lock (_gate) + { + return _locks.Count; + } + } + } /// /// Acquires the operation lock for the given instance. Returns a disposable that releases the lock. @@ -20,16 +41,40 @@ public class OperationLockManager /// public async Task AcquireAsync(string instanceUniqueName, TimeSpan timeout, CancellationToken cancellationToken = default) { - var semaphore = _locks.GetOrAdd(instanceUniqueName, _ => new SemaphoreSlim(1, 1)); - - if (!await semaphore.WaitAsync(timeout, cancellationToken)) + // Reserve a reference (creating the entry if needed) BEFORE waiting, so a + // concurrent waiter for the same instance shares the same semaphore and + // the entry survives until every waiter/holder has released it. + LockEntry entry; + lock (_gate) { - throw new TimeoutException( - $"Could not acquire operation lock for instance '{instanceUniqueName}' within {timeout.TotalSeconds}s. " + - "Another mutating operation is in progress."); + if (!_locks.TryGetValue(instanceUniqueName, out entry!)) + { + entry = new LockEntry(); + _locks[instanceUniqueName] = entry; + } + entry.RefCount++; } - return new LockRelease(semaphore); + try + { + if (!await entry.Semaphore.WaitAsync(timeout, cancellationToken)) + { + throw new TimeoutException( + $"Could not acquire operation lock for instance '{instanceUniqueName}' within {timeout.TotalSeconds}s. " + + "Another mutating operation is in progress."); + } + } + catch (Exception) when (DropReferenceOnFailure(instanceUniqueName, entry)) + { + // DropReferenceOnFailure always returns false; the filter just runs + // the cleanup so the reservation is not leaked when WaitAsync throws + // or times out (TimeoutException / OperationCanceledException). The + // exception still propagates. The semaphore was NOT entered on any + // of these paths, so only the reference is dropped. + throw; + } + + return new LockRelease(this, instanceUniqueName, entry); } /// @@ -37,21 +82,73 @@ public class OperationLockManager /// public bool IsLocked(string instanceUniqueName) { - return _locks.TryGetValue(instanceUniqueName, out var semaphore) && semaphore.CurrentCount == 0; + lock (_gate) + { + return _locks.TryGetValue(instanceUniqueName, out var entry) && entry.Semaphore.CurrentCount == 0; + } + } + + private bool DropReferenceOnFailure(string instanceUniqueName, LockEntry entry) + { + ReleaseReference(instanceUniqueName, entry, semaphoreWasEntered: false); + return false; + } + + /// + /// Drops one reference to the entry. When + /// is true the semaphore is released first. When the reference count reaches + /// zero the entry is removed from the dictionary and the semaphore disposed. + /// + private void ReleaseReference(string instanceUniqueName, LockEntry entry, bool semaphoreWasEntered) + { + lock (_gate) + { + // Release the semaphore (handing the lock to any waiter) and drop the + // reference under the same gate, so the dispose decision below cannot + // race with the Release on an entry that another caller is reclaiming. + if (semaphoreWasEntered) + { + entry.Semaphore.Release(); + } + + entry.RefCount--; + if (entry.RefCount <= 0 && + _locks.TryGetValue(instanceUniqueName, out var current) && + ReferenceEquals(current, entry)) + { + _locks.Remove(instanceUniqueName); + entry.Semaphore.Dispose(); + } + } + } + + private sealed class LockEntry + { + public readonly SemaphoreSlim Semaphore = new(1, 1); + + /// Number of in-flight acquires (waiters + the current holder). Guarded by . + public int RefCount; } private sealed class LockRelease : IDisposable { - private readonly SemaphoreSlim _semaphore; + private readonly OperationLockManager _owner; + private readonly string _instanceUniqueName; + private readonly LockEntry _entry; private int _disposed; - public LockRelease(SemaphoreSlim semaphore) => _semaphore = semaphore; + public LockRelease(OperationLockManager owner, string instanceUniqueName, LockEntry entry) + { + _owner = owner; + _instanceUniqueName = instanceUniqueName; + _entry = entry; + } public void Dispose() { if (Interlocked.CompareExchange(ref _disposed, 1, 0) == 0) { - _semaphore.Release(); + _owner.ReleaseReference(_instanceUniqueName, _entry, semaphoreWasEntered: true); } } } diff --git a/src/ScadaLink.DeploymentManager/ScadaLink.DeploymentManager.csproj b/src/ScadaLink.DeploymentManager/ScadaLink.DeploymentManager.csproj index 4d0eec3..ff9fe7d 100644 --- a/src/ScadaLink.DeploymentManager/ScadaLink.DeploymentManager.csproj +++ b/src/ScadaLink.DeploymentManager/ScadaLink.DeploymentManager.csproj @@ -11,6 +11,7 @@ + diff --git a/src/ScadaLink.DeploymentManager/ServiceCollectionExtensions.cs b/src/ScadaLink.DeploymentManager/ServiceCollectionExtensions.cs index b199a1f..955d620 100644 --- a/src/ScadaLink.DeploymentManager/ServiceCollectionExtensions.cs +++ b/src/ScadaLink.DeploymentManager/ServiceCollectionExtensions.cs @@ -1,9 +1,41 @@ +using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; namespace ScadaLink.DeploymentManager; public static class ServiceCollectionExtensions { + /// + /// Configuration section that is bound to. + /// + public const string OptionsSection = "ScadaLink:DeploymentManager"; + + /// + /// Registers the Deployment Manager services and binds + /// to the + /// configuration section, consistent with the + /// Options-pattern convention ("Per-component configuration via + /// appsettings.json sections bound to options classes"). + /// + public static IServiceCollection AddDeploymentManager( + this IServiceCollection services, + IConfiguration configuration) + { + ArgumentNullException.ThrowIfNull(configuration); + + // DeploymentManager-008: bind the options class so the operation-lock + // and artifact-deployment timeouts are tunable via appsettings.json. + services.Configure(configuration.GetSection(OptionsSection)); + return services.AddDeploymentManager(); + } + + /// + /// Registers the Deployment Manager services without binding options to + /// configuration. falls back to its + /// declared defaults unless configured elsewhere. Prefer the + /// + /// overload so the options are bound to appsettings.json. + /// public static IServiceCollection AddDeploymentManager(this IServiceCollection services) { services.AddSingleton(); diff --git a/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs b/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs index 9b382e0..824bd13 100644 --- a/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs +++ b/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs @@ -13,6 +13,7 @@ using ScadaLink.Commons.Types; using ScadaLink.Commons.Types.Enums; using ScadaLink.Commons.Types.Flattening; using ScadaLink.Communication; +using ScadaLink.TemplateEngine.Flattening; namespace ScadaLink.DeploymentManager.Tests; @@ -45,7 +46,8 @@ public class DeploymentServiceTests : TestKit var siteRepo = Substitute.For(); _service = new DeploymentService( - _repo, siteRepo, _pipeline, _comms, _lockManager, _audit, options, + _repo, siteRepo, _pipeline, _comms, _lockManager, _audit, + new DiffService(), options, NullLogger.Instance); } @@ -276,6 +278,34 @@ public class DeploymentServiceTests : TestKit Assert.Contains("not found", result.Error); } + // ── DeploymentManager-004: site-success but central-delete-failure must not escape uncaught ── + + [Fact] + public async Task DeleteInstanceAsync_SiteSucceeds_CentralDeleteFails_ReturnsDistinctFailure() + { + // The site destroys the Instance Actor and removes its config (response + // Success), but the central record removal throws. The exception must + // NOT propagate uncaught -- it must be surfaced as a distinct failure so + // an operator can reconcile the orphaned central record. + var instance = new Instance("OrphanInst") { Id = 30, SiteId = 1, State = InstanceState.Enabled }; + _repo.GetInstanceByIdAsync(30, Arg.Any()).Returns(instance); + + _repo.DeleteInstanceAsync(30, Arg.Any()) + .Returns(_ => throw new InvalidOperationException("db unavailable")); + + var commActor = Sys.ActorOf(Props.Create(() => + new ReconcileProbeActor(siteHash: "sha256:x", failQuery: false))); + var service = CreateServiceWithCommActor(commActor); + + var result = await service.DeleteInstanceAsync(30, "admin"); + + // The failure is surfaced (not thrown) and clearly says the site + // succeeded but the central record could not be removed. + Assert.True(result.IsFailure); + Assert.Contains("site", result.Error, StringComparison.OrdinalIgnoreCase); + Assert.Contains("central", result.Error, StringComparison.OrdinalIgnoreCase); + } + // ── WP-8: Deployment comparison ── [Fact] @@ -331,6 +361,51 @@ public class DeploymentServiceTests : TestKit Assert.True(result.Value.IsStale); } + // ── DeploymentManager-007: comparison must produce a structured diff ── + + [Fact] + public async Task GetDeploymentComparisonAsync_ProducesStructuredDiff() + { + // The deployed snapshot has one attribute; the current template-derived + // config has a different attribute. The comparison must surface a real + // Added/Removed diff via the TemplateEngine DiffService, not just a + // boolean staleness flag. + var deployedConfig = new FlattenedConfiguration + { + InstanceUniqueName = "DiffInst", + Attributes = [new ResolvedAttribute { CanonicalName = "OldAttr", DataType = "Int" }] + }; + var snapshot = new DeployedConfigSnapshot( + "dep1", "sha256:old", System.Text.Json.JsonSerializer.Serialize(deployedConfig)) + { + InstanceId = 40, + DeployedAt = DateTimeOffset.UtcNow + }; + _repo.GetDeployedSnapshotByInstanceIdAsync(40, Arg.Any()).Returns(snapshot); + + var currentConfig = new FlattenedConfiguration + { + InstanceUniqueName = "DiffInst", + Attributes = [new ResolvedAttribute { CanonicalName = "NewAttr", DataType = "Int" }] + }; + _pipeline.FlattenAndValidateAsync(40, Arg.Any()) + .Returns(Result.Success( + new FlatteningPipelineResult(currentConfig, "sha256:new", ValidationResult.Success()))); + + var result = await _service.GetDeploymentComparisonAsync(40); + + Assert.True(result.IsSuccess); + Assert.True(result.Value.IsStale); + + // A structured diff is present with the added and removed attributes. + Assert.NotNull(result.Value.Diff); + Assert.True(result.Value.Diff!.HasChanges); + Assert.Contains(result.Value.Diff.AttributeChanges, + c => c.CanonicalName == "NewAttr" && c.ChangeType == DiffChangeType.Added); + Assert.Contains(result.Value.Diff.AttributeChanges, + c => c.CanonicalName == "OldAttr" && c.ChangeType == DiffChangeType.Removed); + } + // ── WP-2: GetDeploymentStatusAsync ── [Fact] @@ -352,8 +427,11 @@ public class DeploymentServiceTests : TestKit // ── Audit logging ── [Fact] - public async Task DeployInstanceAsync_AuditLogs() + public async Task DeployInstanceAsync_FlatteningFails_DoesNotReachAudit() { + // DeploymentManager-011: this test previously asserted nothing. A + // flatten failure returns before any site communication, so no audit + // entry is written. var instance = new Instance("TestInst") { Id = 1, SiteId = 1, State = InstanceState.NotDeployed }; _repo.GetInstanceByIdAsync(1).Returns(instance); @@ -362,8 +440,120 @@ public class DeploymentServiceTests : TestKit await _service.DeployInstanceAsync(1, "admin"); - // Failure case does not reach audit (returns before communication) - // The audit is only logged after communication succeeds/fails + await _audit.DidNotReceive().LogAsync( + Arg.Any(), Arg.Any(), Arg.Any(), Arg.Any(), + Arg.Any(), Arg.Any(), Arg.Any()); + } + + [Fact] + public async Task DeployInstanceAsync_SiteSucceeds_WritesDeployAuditEntry() + { + // DeploymentManager-011: a successful deployment must write a "Deploy" + // audit entry referencing the deployed instance. + var instance = new Instance("AuditInst") { Id = 50, SiteId = 1, State = InstanceState.NotDeployed }; + _repo.GetInstanceByIdAsync(50, Arg.Any()).Returns(instance); + SetupValidPipeline(50, "AuditInst", "sha256:target"); + _repo.GetCurrentDeploymentStatusAsync(50, Arg.Any()) + .Returns((DeploymentRecord?)null); + + var commActor = Sys.ActorOf(Props.Create(() => + new ReconcileProbeActor(siteHash: "sha256:target", failQuery: false))); + var service = CreateServiceWithCommActor(commActor); + + var result = await service.DeployInstanceAsync(50, "admin"); + + Assert.True(result.IsSuccess); + await _audit.Received().LogAsync( + "admin", "Deploy", "Instance", "50", "AuditInst", + Arg.Any(), Arg.Any()); + } + + // ── DeploymentManager-011: lifecycle success paths ── + + [Fact] + public async Task DisableInstanceAsync_SiteSucceeds_SetsDisabledStateAndAudits() + { + var instance = new Instance("DisInst") { Id = 51, SiteId = 1, State = InstanceState.Enabled }; + _repo.GetInstanceByIdAsync(51, Arg.Any()).Returns(instance); + + var commActor = Sys.ActorOf(Props.Create(() => + new ReconcileProbeActor(siteHash: "x", failQuery: false))); + var service = CreateServiceWithCommActor(commActor); + + var result = await service.DisableInstanceAsync(51, "admin"); + + Assert.True(result.IsSuccess); + Assert.Equal(InstanceState.Disabled, instance.State); + await _repo.Received().UpdateInstanceAsync(instance, Arg.Any()); + await _audit.Received().LogAsync( + "admin", "Disable", "Instance", "51", "DisInst", + Arg.Any(), Arg.Any()); + } + + [Fact] + public async Task EnableInstanceAsync_SiteSucceeds_SetsEnabledStateAndAudits() + { + var instance = new Instance("EnInst") { Id = 52, SiteId = 1, State = InstanceState.Disabled }; + _repo.GetInstanceByIdAsync(52, Arg.Any()).Returns(instance); + + var commActor = Sys.ActorOf(Props.Create(() => + new ReconcileProbeActor(siteHash: "x", failQuery: false))); + var service = CreateServiceWithCommActor(commActor); + + var result = await service.EnableInstanceAsync(52, "admin"); + + Assert.True(result.IsSuccess); + Assert.Equal(InstanceState.Enabled, instance.State); + await _repo.Received().UpdateInstanceAsync(instance, Arg.Any()); + await _audit.Received().LogAsync( + "admin", "Enable", "Instance", "52", "EnInst", + Arg.Any(), Arg.Any()); + } + + [Fact] + public async Task DeleteInstanceAsync_SiteSucceeds_RemovesRecordAndAudits() + { + var instance = new Instance("DelInst") { Id = 53, SiteId = 1, State = InstanceState.Enabled }; + _repo.GetInstanceByIdAsync(53, Arg.Any()).Returns(instance); + + var commActor = Sys.ActorOf(Props.Create(() => + new ReconcileProbeActor(siteHash: "x", failQuery: false))); + var service = CreateServiceWithCommActor(commActor); + + var result = await service.DeleteInstanceAsync(53, "admin"); + + Assert.True(result.IsSuccess); + await _repo.Received().DeleteInstanceAsync(53, Arg.Any()); + await _audit.Received().LogAsync( + "admin", "Delete", "Instance", "53", "DelInst", + Arg.Any(), Arg.Any()); + } + + [Fact] + public async Task DeployInstanceAsync_SameInstance_OperationLockSerializesConcurrentDeploys() + { + // DeploymentManager-011: two concurrent deploys of the SAME instance + // must be serialized by the per-instance operation lock — the site sees + // them one at a time, never overlapping. + var instance = new Instance("LockInst") { Id = 54, SiteId = 1, State = InstanceState.Enabled }; + _repo.GetInstanceByIdAsync(54, Arg.Any()).Returns(instance); + SetupValidPipeline(54, "LockInst", "sha256:target"); + _repo.GetCurrentDeploymentStatusAsync(54, Arg.Any()) + .Returns((DeploymentRecord?)null); + + var commActor = Sys.ActorOf(Props.Create(() => + new SerializationProbeActor())); + var service = CreateServiceWithCommActor(commActor); + + var deploy1 = service.DeployInstanceAsync(54, "admin"); + var deploy2 = service.DeployInstanceAsync(54, "admin"); + var results = await Task.WhenAll(deploy1, deploy2); + + Assert.True(results[0].IsSuccess); + Assert.True(results[1].IsSuccess); + // The probe records the maximum concurrency observed; the lock must + // keep it at 1 for a single instance. + Assert.Equal(1, SerializationProbeActor.MaxConcurrent); } // ── DeploymentManager-006: query-the-site-before-redeploy idempotency ── @@ -386,6 +576,7 @@ public class DeploymentServiceTests : TestKit var siteRepo = Substitute.For(); return new DeploymentService( _repo, siteRepo, _pipeline, comms, _lockManager, _audit, + new DiffService(), Options.Create(new DeploymentManagerOptions { OperationLockTimeout = TimeSpan.FromSeconds(5) }), NullLogger.Instance); } @@ -572,6 +763,109 @@ public class DeploymentServiceTests : TestKit Assert.Equal(1, ReconcileProbeActor.DeployCount); } + // ── DeploymentManager-003: post-success persistence must commit the Success status ── + + [Fact] + public async Task DeployInstanceAsync_SiteSucceeds_SnapshotWriteFails_RecordStillCommittedSuccess() + { + // The site applies the deployment (response Success), but storing the + // deployed-config snapshot afterwards throws. The deployment record's + // Success status MUST still be durably committed -- otherwise central + // and site diverge: the site runs the new config while central shows a + // non-Success record forever. + var instance = new Instance("SnapFailInst") { Id = 20, SiteId = 1, State = InstanceState.NotDeployed }; + _repo.GetInstanceByIdAsync(20, Arg.Any()).Returns(instance); + SetupValidPipeline(20, "SnapFailInst", "sha256:target"); + _repo.GetCurrentDeploymentStatusAsync(20, Arg.Any()) + .Returns((DeploymentRecord?)null); + + DeploymentRecord? captured = null; + await _repo.AddDeploymentRecordAsync( + Arg.Do(r => captured = r), Arg.Any()); + + // The snapshot store throws. + _repo.GetDeployedSnapshotByInstanceIdAsync(20, Arg.Any()) + .Returns((DeployedConfigSnapshot?)null); + _repo.AddDeployedSnapshotAsync(Arg.Any(), Arg.Any()) + .Returns(_ => throw new InvalidOperationException("snapshot store unavailable")); + + var commActor = Sys.ActorOf(Props.Create(() => + new ReconcileProbeActor(siteHash: "sha256:target", failQuery: false))); + var service = CreateServiceWithCommActor(commActor); + + var result = await service.DeployInstanceAsync(20, "admin"); + + // The site succeeded -> the deployment is reported successful. + Assert.True(result.IsSuccess); + Assert.NotNull(captured); + Assert.Equal(DeploymentStatus.Success, captured!.Status); + + // The Success status was committed (a SaveChanges happened with the + // record in Success state) BEFORE the snapshot write was attempted. + await _repo.Received().UpdateDeploymentRecordAsync( + Arg.Is(r => r.Status == DeploymentStatus.Success), + Arg.Any()); + } + + /// + /// Stand-in CentralCommunicationActor that measures deploy concurrency. It + /// defers each deploy reply via the scheduler, so if two deploys for the + /// same instance were NOT serialized by the operation lock their windows + /// would overlap and would exceed 1. + /// + private class SerializationProbeActor : ReceiveActor, IWithTimers + { + public static int MaxConcurrent; + private static int _current; + private static readonly object Gate = new(); + + public ITimerScheduler Timers { get; set; } = null!; + + public SerializationProbeActor() + { + MaxConcurrent = 0; + _current = 0; + + Receive(env => + { + if (env.Message is DeployInstanceCommand d) + { + lock (Gate) + { + _current++; + if (_current > MaxConcurrent) MaxConcurrent = _current; + } + + var replyTo = Sender; + // Defer the reply so the deploy "window" stays open long + // enough for a non-serialized second deploy to overlap. + Timers.StartSingleTimer( + d.DeploymentId, + new CompleteDeploy(d, replyTo), + TimeSpan.FromMilliseconds(150)); + } + else if (env.Message is DeploymentStateQueryRequest q) + { + Sender.Tell(new DeploymentStateQueryResponse( + q.CorrelationId, q.InstanceUniqueName, false, null, null, DateTimeOffset.UtcNow)); + } + }); + + Receive(c => + { + lock (Gate) + { + _current--; + } + c.ReplyTo.Tell(new DeploymentStatusResponse( + c.Command.DeploymentId, c.Command.InstanceUniqueName, + DeploymentStatus.Success, null, DateTimeOffset.UtcNow)); + }); + } + + private sealed record CompleteDeploy(DeployInstanceCommand Command, IActorRef ReplyTo); + } + /// /// Stand-in CentralCommunicationActor for reconciliation tests. Counts the /// site queries and deploy commands it receives, answers queries with a @@ -610,6 +904,21 @@ public class DeploymentServiceTests : TestKit d.DeploymentId, d.InstanceUniqueName, DeploymentStatus.Success, null, DateTimeOffset.UtcNow)); break; + + case DisableInstanceCommand dis: + Sender.Tell(new InstanceLifecycleResponse( + dis.CommandId, dis.InstanceUniqueName, true, null, DateTimeOffset.UtcNow)); + break; + + case EnableInstanceCommand en: + Sender.Tell(new InstanceLifecycleResponse( + en.CommandId, en.InstanceUniqueName, true, null, DateTimeOffset.UtcNow)); + break; + + case DeleteInstanceCommand del: + Sender.Tell(new InstanceLifecycleResponse( + del.CommandId, del.InstanceUniqueName, true, null, DateTimeOffset.UtcNow)); + break; } }); } diff --git a/tests/ScadaLink.DeploymentManager.Tests/OperationLockManagerTests.cs b/tests/ScadaLink.DeploymentManager.Tests/OperationLockManagerTests.cs index 8dea4d7..66dd36d 100644 --- a/tests/ScadaLink.DeploymentManager.Tests/OperationLockManagerTests.cs +++ b/tests/ScadaLink.DeploymentManager.Tests/OperationLockManagerTests.cs @@ -92,4 +92,52 @@ public class OperationLockManagerTests await Task.WhenAll(tasks); } + + // ── DeploymentManager-005: semaphore must not leak ── + + [Fact] + public async Task AcquireAsync_ReleasedLock_RemovesSemaphoreEntry() + { + // A semaphore that is created, used, and fully released must not be + // retained — otherwise every distinct instance name leaks a + // SemaphoreSlim (a kernel handle) for the life of the process. + using (await _lockManager.AcquireAsync("transient-inst", TimeSpan.FromSeconds(5))) + { + Assert.Equal(1, _lockManager.TrackedLockCount); + } + + // After release, the entry is reclaimed. + Assert.Equal(0, _lockManager.TrackedLockCount); + } + + [Fact] + public async Task AcquireAsync_ManyDistinctInstances_DoesNotAccumulateSemaphores() + { + // Simulates the long-running central process: many instances are + // deployed/disabled over time. Their semaphores must be reclaimed. + for (var i = 0; i < 100; i++) + { + using var handle = await _lockManager.AcquireAsync($"churn-{i}", TimeSpan.FromSeconds(5)); + } + + Assert.Equal(0, _lockManager.TrackedLockCount); + } + + [Fact] + public async Task AcquireAsync_ContendedLock_KeepsSemaphoreUntilLastReleaseThenReclaims() + { + // While a second caller is waiting, the semaphore must survive the + // first release; only when the last holder releases is it reclaimed. + var first = await _lockManager.AcquireAsync("contended", TimeSpan.FromSeconds(5)); + + var secondAcquire = _lockManager.AcquireAsync("contended", TimeSpan.FromSeconds(5)); + + first.Dispose(); // hands the lock to the waiter; entry must NOT be removed + var second = await secondAcquire; + + Assert.Equal(1, _lockManager.TrackedLockCount); + second.Dispose(); + + Assert.Equal(0, _lockManager.TrackedLockCount); + } } diff --git a/tests/ScadaLink.DeploymentManager.Tests/ServiceCollectionExtensionsTests.cs b/tests/ScadaLink.DeploymentManager.Tests/ServiceCollectionExtensionsTests.cs new file mode 100644 index 0000000..7503814 --- /dev/null +++ b/tests/ScadaLink.DeploymentManager.Tests/ServiceCollectionExtensionsTests.cs @@ -0,0 +1,49 @@ +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Options; + +namespace ScadaLink.DeploymentManager.Tests; + +/// +/// DeploymentManager-008: DeploymentManagerOptions must be bound to the +/// "ScadaLink:DeploymentManager" configuration section, consistent with the +/// Options-pattern convention used by the other components. +/// +public class ServiceCollectionExtensionsTests +{ + [Fact] + public void AddDeploymentManager_WithConfiguration_BindsDeploymentManagerOptions() + { + var configuration = new ConfigurationBuilder() + .AddInMemoryCollection(new Dictionary + { + ["ScadaLink:DeploymentManager:OperationLockTimeout"] = "00:00:09", + ["ScadaLink:DeploymentManager:ArtifactDeploymentTimeoutPerSite"] = "00:03:00" + }) + .Build(); + + var services = new ServiceCollection(); + services.AddDeploymentManager(configuration); + + using var provider = services.BuildServiceProvider(); + var options = provider.GetRequiredService>().Value; + + Assert.Equal(TimeSpan.FromSeconds(9), options.OperationLockTimeout); + Assert.Equal(TimeSpan.FromMinutes(3), options.ArtifactDeploymentTimeoutPerSite); + } + + [Fact] + public void AddDeploymentManager_WithConfiguration_MissingSection_UsesDefaults() + { + var configuration = new ConfigurationBuilder().Build(); + + var services = new ServiceCollection(); + services.AddDeploymentManager(configuration); + + using var provider = services.BuildServiceProvider(); + var options = provider.GetRequiredService>().Value; + + // No section present -> the option-class defaults are retained. + Assert.Equal(TimeSpan.FromSeconds(5), options.OperationLockTimeout); + } +}