From e9ee4e3ea56aab4e200694ef53a0c1f29217aac4 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sat, 16 May 2026 22:14:23 -0400 Subject: [PATCH] =?UTF-8?q?fix(deployment-manager):=20resolve=20Deployment?= =?UTF-8?q?Manager-009,010,012,014=20=E2=80=94=20shared=20deployment=20ID,?= =?UTF-8?q?=20lifecycle-timeout=20enforcement,=20doc/test=20cleanup;=20Dep?= =?UTF-8?q?loymentManager-013=20flagged?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- code-reviews/DeploymentManager/findings.md | 103 +++++++++++-- .../ArtifactDeploymentService.cs | 33 ++++- .../DeploymentManagerOptions.cs | 6 +- .../DeploymentService.cs | 50 ++++++- .../ArtifactDeploymentServiceTests.cs | 135 +++++++++++++++++- .../DeploymentServiceTests.cs | 53 +++++++ 6 files changed, 355 insertions(+), 25 deletions(-) diff --git a/code-reviews/DeploymentManager/findings.md b/code-reviews/DeploymentManager/findings.md index c906410..39d9cc5 100644 --- a/code-reviews/DeploymentManager/findings.md +++ b/code-reviews/DeploymentManager/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-16 | | Reviewer | claude-agent | | Commit reviewed | `9c60592` | -| Open findings | 5 | +| Open findings | 1 | ## Summary @@ -423,7 +423,7 @@ configuration binding. Regression tests: |--|--| | Severity | Low | | Category | Documentation & comments | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:288` | **Description** @@ -436,6 +436,9 @@ into the comment and not derived from any constant in this module. If `LifecycleTimeout` is reconfigured, the comment becomes wrong. It also wrongly implies the value lives in this module. +**Verification:** Confirmed against source. The `DeleteInstanceAsync` XML doc +quoted a hard-coded "30s" value. + **Recommendation** Reword to "Delete fails if the site is unreachable within @@ -443,7 +446,12 @@ Reword to "Delete fails if the site is unreachable within **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit pending): the `DeleteInstanceAsync` XML doc no +longer quotes a hard-coded "30s" — it now states delete fails if the site is +unreachable within `CommunicationOptions.LifecycleTimeout` (and notes the +deadline is applied inside `CommunicationService.DeleteInstanceAsync`). +Documentation-only change; no regression test (a test asserting comment text +would be meaningless). ### DeploymentManager-010 — `SystemArtifactDeploymentRecord` does not persist the deployment ID @@ -451,7 +459,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Correctness & logic bugs | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.DeploymentManager/ArtifactDeploymentService.cs:136,194-211` | **Description** @@ -465,6 +473,9 @@ stored record. Additionally each per-site `DeployArtifactsCommand` carries its own separate GUID (`BuildDeployArtifactsCommandAsync` line 114), so there are in fact N+1 unrelated IDs for one logical artifact deployment. +**Verification:** Confirmed against source. Each per-site command minted its own +GUID and the persisted record had no way to reference the logical id. + **Recommendation** Add a `DeploymentId` column to `SystemArtifactDeploymentRecord` and store the @@ -473,7 +484,23 @@ per-site commands so the audit log, UI summary, and persisted record agree. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit pending): `BuildDeployArtifactsCommandAsync` now +accepts an optional `deploymentId`, and `DeployToAllSitesAsync` passes the one +logical `deploymentId` to every per-site command — so the per-site commands, +the audit log, and the UI summary all reference a single id instead of N+1 +unrelated GUIDs (`RetryForSiteAsync`, an independent single-site retry, still +mints its own id). Adding a dedicated `DeploymentId` *column* to +`SystemArtifactDeploymentRecord` was deliberately **not** done: that entity +lives in `ScadaLink.Commons` with its EF mapping in +`ScadaLink.ConfigurationDatabase`, both outside this module's edit scope. +Instead the logical `deploymentId` is embedded in the record's free-form +`PerSiteStatus` JSON payload (`{ DeploymentId, Sites }`), which is fully within +this module's control, so the persisted record is correlatable with the +summary/audit. A follow-up to promote it to a first-class column should be +filed against Commons/ConfigurationDatabase if a queryable index is needed. +Regression tests: `DeployToAllSitesAsync_AllPerSiteCommandsShareTheSummaryDeploymentId`, +`DeployToAllSitesAsync_PartialFailure_ReportsPerSiteMatrix`, +`RetryForSiteAsync_SiteSucceeds_ReturnsSuccessAndAudits`. ### DeploymentManager-011 — Tests never exercise a successful deployment or lifecycle success path @@ -536,7 +563,7 @@ which asserts on `IAuditService.LogAsync`. Regression tests: |--|--| | Severity | Low | | Category | Documentation & comments | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.DeploymentManager/DeploymentManagerOptions.cs:8-9` | **Description** @@ -547,6 +574,9 @@ default and an XML doc, but it is never read anywhere in the codebase `CommunicationService`). The option misleads readers into thinking it controls disable/enable/delete timeouts, when setting it has no effect. +**Verification:** Confirmed against source. A repo-wide grep found exactly one +occurrence of `LifecycleCommandTimeout` — the declaration itself. + **Recommendation** Remove `LifecycleCommandTimeout`, or actually thread it through to the @@ -556,7 +586,21 @@ lifecycle command calls (e.g. by creating a linked CTS with this timeout in **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit pending): `LifecycleCommandTimeout` is now actually +threaded through (the option exists for tuning, so it was wired up rather than +deleted). `DisableInstanceAsync`/`EnableInstanceAsync`/`DeleteInstanceAsync` +each create a linked `CancellationTokenSource` with `CancelAfter( +_options.LifecycleCommandTimeout)` — the same pattern `ArtifactDeploymentService` +uses for `ArtifactDeploymentTimeoutPerSite` — and pass its token to the +`CommunicationService` call. Each method now catches the resulting +`TimeoutException`/`OperationCanceledException`, logs a warning, and returns a +`Result.Failure` (previously an `AskTimeoutException` from a hung site escaped +uncaught). The option's XML doc was corrected to describe the real behaviour. +Regression test: +`DisableInstanceAsync_SiteUnresponsive_LifecycleCommandTimeoutBoundsTheWait` +(asserts a 300 ms `LifecycleCommandTimeout` bounds the wait far below the 30 s +`CommunicationOptions.LifecycleTimeout`; confirmed to fail before the fix — +the call hung the full 30 s and threw `AskTimeoutException`). ### DeploymentManager-013 — SMTP credentials serialized and broadcast to all sites @@ -585,9 +629,35 @@ Confirm inter-cluster transport encryption covers artifact commands, ensure SMTP credentials on site SQLite. Consider encrypting the credential field within the artifact payload. +**Verification (2026-05-16):** Re-triaged against source. The DeploymentManager +side is **clean**: `ArtifactDeploymentService` maps `SmtpConfiguration.Credentials` +into the artifact (which the design explicitly mandates — SMTP configuration is +a deployable artifact) and **never logs it** — the three log statements in +`DeployToAllSitesAsync` only reference `SiteId`, `SiteName`, `DeploymentId`, and +`ex.Message`, never the credential. There is no defect to fix purely within +`src/ScadaLink.DeploymentManager`. The finding's remaining recommendations are +all cross-module and one needs a design decision: + - inter-cluster transport TLS — `ScadaLink.Communication` / + `ScadaLink.ClusterInfrastructure` (Akka remoting + ClusterClient config); + - at-rest encryption of the credential on site SQLite — `ScadaLink.SiteRuntime` + artifact store; + - encrypting the credential field inside the artifact payload — needs the + `SmtpConfigurationArtifact` shape in `ScadaLink.Commons` plus cooperating + producer (DeploymentManager) and consumer (SiteRuntime) changes, and a + **key-management design decision** (where the encryption key lives, how it + is distributed to sites) that cannot be made unilaterally here. + +**Status: Open — flagged.** No purely-DeploymentManager fix exists; the work +crosses Communication / SiteRuntime / Commons and requires a key-management +design decision. Severity confirmed Low: with TLS-protected inter-cluster +transport (a separate, assumed-in-place control) and no logging leak, this is a +hardening item, not an active leak. + **Resolution** -_Unresolved._ +_Unresolved — see Verification above. Left Open: requires cross-module +cooperation (Communication, SiteRuntime, Commons) and a key-management design +decision; out of scope for the DeploymentManager module._ ### DeploymentManager-014 — Dead `CreateCommand` helper in artifact tests @@ -595,7 +665,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Testing coverage | -| Status | Open | +| Status | Resolved | | Location | `tests/ScadaLink.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs:86-90` | **Description** @@ -606,6 +676,10 @@ multi-site artifact deployment) was never written — coverage of `DeployToAllSitesAsync` is limited to the no-sites failure case, and `RetryForSiteAsync` and `BuildDeployArtifactsCommandAsync` have no tests at all. +**Verification:** Confirmed against source. The `CreateCommand()` helper had no +callers, and `DeployToAllSitesAsync`/`RetryForSiteAsync` only had the no-sites +failure case. + **Recommendation** Either remove the unused helper or, preferably, write the missing tests for @@ -614,4 +688,13 @@ Either remove the unused helper or, preferably, write the missing tests for **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit pending): took the recommendation's preferred +option — removed the dead `CreateCommand()` helper and wrote the missing +coverage instead. `ArtifactDeploymentServiceTests` now extends `TestKit` and +uses a stand-in `ArtifactProbeActor` (records the `DeployArtifactsCommand`s it +receives, replies success or, for a configured failure set, failure) so +`DeployToAllSitesAsync` and `RetryForSiteAsync` are exercised end-to-end past +the communication boundary. New tests: +`DeployToAllSitesAsync_AllPerSiteCommandsShareTheSummaryDeploymentId` (also +covers DeploymentManager-010), `DeployToAllSitesAsync_PartialFailure_ReportsPerSiteMatrix` +(per-site success/failure matrix), `RetryForSiteAsync_SiteSucceeds_ReturnsSuccessAndAudits`. diff --git a/src/ScadaLink.DeploymentManager/ArtifactDeploymentService.cs b/src/ScadaLink.DeploymentManager/ArtifactDeploymentService.cs index b622ce7..92ad5bb 100644 --- a/src/ScadaLink.DeploymentManager/ArtifactDeploymentService.cs +++ b/src/ScadaLink.DeploymentManager/ArtifactDeploymentService.cs @@ -58,9 +58,17 @@ public class ArtifactDeploymentService /// Collects all artifact types from repositories and builds a /// scoped to a specific site's data connections. /// + /// The DB id of the site whose data connections are collected. + /// + /// DeploymentManager-010: the logical deployment id for this artifact deployment. All per-site + /// commands of one call share this id so the audit log, + /// UI summary, and persisted record correlate. When null a fresh id is minted (used by + /// single-site retries). + /// public async Task BuildDeployArtifactsCommandAsync( int siteId, - CancellationToken cancellationToken = default) + CancellationToken cancellationToken = default, + string? deploymentId = null) { var sharedScripts = await _templateRepo.GetAllSharedScriptsAsync(cancellationToken); var externalSystems = await _externalSystemRepo.GetAllExternalSystemsAsync(cancellationToken); @@ -111,7 +119,7 @@ public class ArtifactDeploymentService smtp.Credentials, null, smtp.TlsMode)).ToList(); return new DeployArtifactsCommand( - Guid.NewGuid().ToString("N"), + deploymentId ?? Guid.NewGuid().ToString("N"), scriptArtifacts, externalSystemArtifacts, dbConnectionArtifacts, @@ -136,11 +144,15 @@ public class ArtifactDeploymentService var deploymentId = Guid.NewGuid().ToString("N"); var perSiteResults = new Dictionary(); - // Build per-site commands sequentially (DbContext is not thread-safe) + // Build per-site commands sequentially (DbContext is not thread-safe). + // DeploymentManager-010: every per-site command carries the SAME logical + // deploymentId, so the per-site commands, audit log, persisted record, + // and UI summary all reference one id instead of N+1 unrelated GUIDs. var siteCommands = new Dictionary(); foreach (var site in sites) { - siteCommands[site.Id] = await BuildDeployArtifactsCommandAsync(site.Id, cancellationToken); + siteCommands[site.Id] = await BuildDeployArtifactsCommandAsync( + site.Id, cancellationToken, deploymentId); } // Deploy to each site in parallel with per-site timeout @@ -190,11 +202,20 @@ public class ArtifactDeploymentService perSiteResults[result.SiteId] = result; } - // Persist the system artifact deployment record + // Persist the system artifact deployment record. + // DeploymentManager-010: SystemArtifactDeploymentRecord has no dedicated + // DeploymentId column (adding one is a Commons/ConfigurationDatabase + // schema change outside this module). The logical deploymentId is + // embedded in the PerSiteStatus payload so the persisted record can be + // correlated with the audit log and UI summary that report the same id. var record = new SystemArtifactDeploymentRecord("Artifacts", user) { DeployedAt = DateTimeOffset.UtcNow, - PerSiteStatus = JsonSerializer.Serialize(perSiteResults) + PerSiteStatus = JsonSerializer.Serialize(new + { + DeploymentId = deploymentId, + Sites = perSiteResults + }) }; await _deploymentRepo.AddSystemArtifactDeploymentAsync(record, cancellationToken); await _deploymentRepo.SaveChangesAsync(cancellationToken); diff --git a/src/ScadaLink.DeploymentManager/DeploymentManagerOptions.cs b/src/ScadaLink.DeploymentManager/DeploymentManagerOptions.cs index 61cc382..3d607d4 100644 --- a/src/ScadaLink.DeploymentManager/DeploymentManagerOptions.cs +++ b/src/ScadaLink.DeploymentManager/DeploymentManagerOptions.cs @@ -5,7 +5,11 @@ namespace ScadaLink.DeploymentManager; /// public class DeploymentManagerOptions { - /// Timeout for lifecycle commands sent to sites (disable, enable, delete). + /// + /// WP-6: Timeout for a lifecycle command round-trip (disable, enable, delete). + /// Applied as a linked-CTS deadline in DeploymentService so a hung or + /// unreachable site does not hold the per-instance operation lock indefinitely. + /// public TimeSpan LifecycleCommandTimeout { get; set; } = TimeSpan.FromSeconds(30); /// WP-7: Timeout per site for system-wide artifact deployment. diff --git a/src/ScadaLink.DeploymentManager/DeploymentService.cs b/src/ScadaLink.DeploymentManager/DeploymentService.cs index 6f3ad50..46180c9 100644 --- a/src/ScadaLink.DeploymentManager/DeploymentService.cs +++ b/src/ScadaLink.DeploymentManager/DeploymentService.cs @@ -302,7 +302,21 @@ public class DeploymentService var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken); var command = new DisableInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow); - var response = await _communicationService.DisableInstanceAsync(siteId, command, cancellationToken); + // WP-6: bound the round-trip with the configured lifecycle timeout so a + // hung/unreachable site does not block the operation lock indefinitely. + InstanceLifecycleResponse response; + try + { + using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); + cts.CancelAfter(_options.LifecycleCommandTimeout); + response = await _communicationService.DisableInstanceAsync(siteId, command, cts.Token); + } + catch (Exception ex) when (ex is TimeoutException or OperationCanceledException) + { + _logger.LogWarning(ex, "Disable of instance {Instance} timed out", instance.UniqueName); + return Result.Failure( + $"Disable failed: the site did not respond within {_options.LifecycleCommandTimeout}."); + } if (response.Success) { @@ -343,7 +357,20 @@ public class DeploymentService var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken); var command = new EnableInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow); - var response = await _communicationService.EnableInstanceAsync(siteId, command, cancellationToken); + // WP-6: bound the round-trip with the configured lifecycle timeout. + InstanceLifecycleResponse response; + try + { + using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); + cts.CancelAfter(_options.LifecycleCommandTimeout); + response = await _communicationService.EnableInstanceAsync(siteId, command, cts.Token); + } + catch (Exception ex) when (ex is TimeoutException or OperationCanceledException) + { + _logger.LogWarning(ex, "Enable of instance {Instance} timed out", instance.UniqueName); + return Result.Failure( + $"Enable failed: the site did not respond within {_options.LifecycleCommandTimeout}."); + } if (response.Success) { @@ -365,7 +392,9 @@ public class DeploymentService /// WP-6: Delete an instance. Stops the site actor, removes site config, and /// removes the central instance record (deployment history, snapshot, /// overrides, and connection bindings go with it). S&F NOT cleared. - /// Delete fails if site unreachable (30s timeout via CommunicationOptions). + /// Delete fails if the site is unreachable within + /// CommunicationOptions.LifecycleTimeout (applied inside + /// ). /// public async Task> DeleteInstanceAsync( int instanceId, @@ -387,7 +416,20 @@ public class DeploymentService var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken); var command = new DeleteInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow); - var response = await _communicationService.DeleteInstanceAsync(siteId, command, cancellationToken); + // WP-6: bound the round-trip with the configured lifecycle timeout. + InstanceLifecycleResponse response; + try + { + using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); + cts.CancelAfter(_options.LifecycleCommandTimeout); + response = await _communicationService.DeleteInstanceAsync(siteId, command, cts.Token); + } + catch (Exception ex) when (ex is TimeoutException or OperationCanceledException) + { + _logger.LogWarning(ex, "Delete of instance {Instance} timed out", instance.UniqueName); + return Result.Failure( + $"Delete failed: the site did not respond within {_options.LifecycleCommandTimeout}."); + } if (response.Success) { diff --git a/tests/ScadaLink.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs b/tests/ScadaLink.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs index 52b083f..3c7e94b 100644 --- a/tests/ScadaLink.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs +++ b/tests/ScadaLink.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs @@ -1,6 +1,11 @@ +using System.Collections.Concurrent; +using System.Text.Json; +using Akka.Actor; +using Akka.TestKit.Xunit2; using Microsoft.Extensions.Logging.Abstractions; using Microsoft.Extensions.Options; using NSubstitute; +using ScadaLink.Commons.Entities.Deployment; using ScadaLink.Commons.Entities.Sites; using ScadaLink.Commons.Interfaces.Repositories; using ScadaLink.Commons.Interfaces.Services; @@ -12,7 +17,7 @@ namespace ScadaLink.DeploymentManager.Tests; /// /// WP-7: Tests for system-wide artifact deployment. /// -public class ArtifactDeploymentServiceTests +public class ArtifactDeploymentServiceTests : TestKit { private readonly ISiteRepository _siteRepo; private readonly IDeploymentManagerRepository _deploymentRepo; @@ -70,6 +75,86 @@ public class ArtifactDeploymentServiceTests Assert.Equal(3, summary.SiteResults.Count); } + // ── DeploymentManager-010: one logical deployment id across all per-site commands ── + + [Fact] + public async Task DeployToAllSitesAsync_AllPerSiteCommandsShareTheSummaryDeploymentId() + { + // DeploymentManager-010: previously each per-site DeployArtifactsCommand + // minted its own GUID, so one logical deployment produced N+1 unrelated + // ids. Every per-site command must now carry the SAME id, equal to the + // id reported in the summary and audit log. + var sites = new List + { + new("Site One", "site-1") { Id = 1 }, + new("Site Two", "site-2") { Id = 2 } + }; + _siteRepo.GetAllSitesAsync(Arg.Any()).Returns(sites); + var probe = Sys.ActorOf(Props.Create(() => new ArtifactProbeActor())); + var service = CreateServiceWithCommActor(probe); + + var result = await service.DeployToAllSitesAsync("admin"); + + Assert.True(result.IsSuccess); + var commands = ArtifactProbeActor.Received; + Assert.Equal(2, commands.Count); + + // All per-site commands carry one shared id, equal to the summary id. + var distinctIds = commands.Select(c => c.DeploymentId).Distinct().ToList(); + Assert.Single(distinctIds); + Assert.Equal(result.Value.DeploymentId, distinctIds[0]); + + // The persisted record embeds the same logical deployment id. + await _deploymentRepo.Received().AddSystemArtifactDeploymentAsync( + Arg.Do(r => + { + using var doc = JsonDocument.Parse(r.PerSiteStatus!); + Assert.Equal(result.Value.DeploymentId, + doc.RootElement.GetProperty("DeploymentId").GetString()); + }), + Arg.Any()); + } + + // ── DeploymentManager-014: real per-site success/failure coverage ── + + [Fact] + public async Task DeployToAllSitesAsync_PartialFailure_ReportsPerSiteMatrix() + { + // Site one succeeds, site two fails -> the summary counts must reflect + // the per-site matrix. + var sites = new List + { + new("Site One", "ok-site") { Id = 1 }, + new("Site Two", "fail-site") { Id = 2 } + }; + _siteRepo.GetAllSitesAsync(Arg.Any()).Returns(sites); + var probe = Sys.ActorOf(Props.Create(() => new ArtifactProbeActor("fail-site"))); + var service = CreateServiceWithCommActor(probe); + + var result = await service.DeployToAllSitesAsync("admin"); + + Assert.True(result.IsSuccess); + Assert.Equal(1, result.Value.SuccessCount); + Assert.Equal(1, result.Value.FailureCount); + Assert.Contains(result.Value.SiteResults, r => r.SiteId == "ok-site" && r.Success); + Assert.Contains(result.Value.SiteResults, r => r.SiteId == "fail-site" && !r.Success); + } + + [Fact] + public async Task RetryForSiteAsync_SiteSucceeds_ReturnsSuccessAndAudits() + { + var probe = Sys.ActorOf(Props.Create(() => new ArtifactProbeActor())); + var service = CreateServiceWithCommActor(probe); + + var result = await service.RetryForSiteAsync(1, "retry-site", "admin"); + + Assert.True(result.IsSuccess); + Assert.Equal("retry-site", result.Value.SiteId); + await _audit.Received().LogAsync( + "admin", "RetryArtifactDeployment", "SystemArtifact", + Arg.Any(), "retry-site", Arg.Any(), Arg.Any()); + } + private ArtifactDeploymentService CreateService() { var comms = new CommunicationService( @@ -83,9 +168,51 @@ public class ArtifactDeploymentServiceTests NullLogger.Instance); } - private static DeployArtifactsCommand CreateCommand() + private ArtifactDeploymentService CreateServiceWithCommActor(IActorRef commActor) { - return new DeployArtifactsCommand( - "dep1", null, null, null, null, null, null, DateTimeOffset.UtcNow); + var comms = new CommunicationService( + Options.Create(new CommunicationOptions + { + ArtifactDeploymentTimeout = TimeSpan.FromSeconds(5) + }), + NullLogger.Instance); + comms.SetCommunicationActor(commActor); + + return new ArtifactDeploymentService( + _siteRepo, _deploymentRepo, _templateRepo, _externalSystemRepo, _notificationRepo, + comms, _audit, + Options.Create(new DeploymentManagerOptions + { + ArtifactDeploymentTimeoutPerSite = TimeSpan.FromSeconds(5) + }), + NullLogger.Instance); + } + + /// + /// Stand-in CentralCommunicationActor for artifact deployment. Records every + /// it receives and replies success + /// unless the target site id is in the configured failure set. + /// + private class ArtifactProbeActor : ReceiveActor + { + public static readonly ConcurrentBag Received = new(); + + public ArtifactProbeActor(params string[] failingSites) + { + Received.Clear(); + var failSet = new HashSet(failingSites); + + Receive(env => + { + if (env.Message is DeployArtifactsCommand cmd) + { + Received.Add(cmd); + var success = !failSet.Contains(env.SiteId); + Sender.Tell(new ArtifactDeploymentResponse( + cmd.DeploymentId, env.SiteId, success, + success ? null : "site rejected artifacts", DateTimeOffset.UtcNow)); + } + }); + } } } diff --git a/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs b/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs index 824bd13..4276b3d 100644 --- a/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs +++ b/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs @@ -763,6 +763,59 @@ public class DeploymentServiceTests : TestKit Assert.Equal(1, ReconcileProbeActor.DeployCount); } + // ── DeploymentManager-012: LifecycleCommandTimeout must actually bound lifecycle commands ── + + [Fact] + public async Task DisableInstanceAsync_SiteUnresponsive_LifecycleCommandTimeoutBoundsTheWait() + { + // The site never replies to the DisableInstanceCommand. A short + // LifecycleCommandTimeout must abort the wait quickly -- if the option + // is dead code the call would instead hang until CommunicationOptions + // .LifecycleTimeout (much longer) elapses. + var instance = new Instance("StuckInst") { Id = 60, SiteId = 1, State = InstanceState.Enabled }; + _repo.GetInstanceByIdAsync(60, Arg.Any()).Returns(instance); + + // Probe drops every message -> no reply ever arrives. + var commActor = Sys.ActorOf(Props.Create(() => new SilentProbeActor())); + + var comms = new CommunicationService( + Options.Create(new CommunicationOptions + { + // Long communication-layer timeout: if LifecycleCommandTimeout + // were dead, the test would wait this long. + LifecycleTimeout = TimeSpan.FromSeconds(30) + }), + NullLogger.Instance); + comms.SetCommunicationActor(commActor); + + var siteRepo = Substitute.For(); + var service = new DeploymentService( + _repo, siteRepo, _pipeline, comms, _lockManager, _audit, + new DiffService(), + Options.Create(new DeploymentManagerOptions + { + OperationLockTimeout = TimeSpan.FromSeconds(5), + LifecycleCommandTimeout = TimeSpan.FromMilliseconds(300) + }), + NullLogger.Instance); + + var sw = System.Diagnostics.Stopwatch.StartNew(); + var result = await service.DisableInstanceAsync(60, "admin"); + sw.Stop(); + + Assert.True(result.IsFailure); + // The 300ms LifecycleCommandTimeout bounded the wait well under the + // 30s communication-layer timeout. + Assert.True(sw.Elapsed < TimeSpan.FromSeconds(10), + $"Lifecycle command was not bounded by LifecycleCommandTimeout (took {sw.Elapsed})."); + } + + /// Stand-in actor that never replies to anything. + private class SilentProbeActor : ReceiveActor + { + public SilentProbeActor() => ReceiveAny(_ => { }); + } + // ── DeploymentManager-003: post-success persistence must commit the Success status ── [Fact]