diff --git a/code-reviews/DeploymentManager/findings.md b/code-reviews/DeploymentManager/findings.md
index c906410..39d9cc5 100644
--- a/code-reviews/DeploymentManager/findings.md
+++ b/code-reviews/DeploymentManager/findings.md
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
-| Open findings | 5 |
+| Open findings | 1 |
## Summary
@@ -423,7 +423,7 @@ configuration binding. Regression tests:
|--|--|
| Severity | Low |
| Category | Documentation & comments |
-| Status | Open |
+| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:288` |
**Description**
@@ -436,6 +436,9 @@ into the comment and not derived from any constant in this module. If
`LifecycleTimeout` is reconfigured, the comment becomes wrong. It also wrongly
implies the value lives in this module.
+**Verification:** Confirmed against source. The `DeleteInstanceAsync` XML doc
+quoted a hard-coded "30s" value.
+
**Recommendation**
Reword to "Delete fails if the site is unreachable within
@@ -443,7 +446,12 @@ Reword to "Delete fails if the site is unreachable within
**Resolution**
-_Unresolved._
+Resolved 2026-05-16 (commit pending): the `DeleteInstanceAsync` XML doc no
+longer quotes a hard-coded "30s" — it now states delete fails if the site is
+unreachable within `CommunicationOptions.LifecycleTimeout` (and notes the
+deadline is applied inside `CommunicationService.DeleteInstanceAsync`).
+Documentation-only change; no regression test (a test asserting comment text
+would be meaningless).
### DeploymentManager-010 — `SystemArtifactDeploymentRecord` does not persist the deployment ID
@@ -451,7 +459,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
-| Status | Open |
+| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/ArtifactDeploymentService.cs:136,194-211` |
**Description**
@@ -465,6 +473,9 @@ stored record. Additionally each per-site `DeployArtifactsCommand` carries its
own separate GUID (`BuildDeployArtifactsCommandAsync` line 114), so there are in
fact N+1 unrelated IDs for one logical artifact deployment.
+**Verification:** Confirmed against source. Each per-site command minted its own
+GUID and the persisted record had no way to reference the logical id.
+
**Recommendation**
Add a `DeploymentId` column to `SystemArtifactDeploymentRecord` and store the
@@ -473,7 +484,23 @@ per-site commands so the audit log, UI summary, and persisted record agree.
**Resolution**
-_Unresolved._
+Resolved 2026-05-16 (commit pending): `BuildDeployArtifactsCommandAsync` now
+accepts an optional `deploymentId`, and `DeployToAllSitesAsync` passes the one
+logical `deploymentId` to every per-site command — so the per-site commands,
+the audit log, and the UI summary all reference a single id instead of N+1
+unrelated GUIDs (`RetryForSiteAsync`, an independent single-site retry, still
+mints its own id). Adding a dedicated `DeploymentId` *column* to
+`SystemArtifactDeploymentRecord` was deliberately **not** done: that entity
+lives in `ScadaLink.Commons` with its EF mapping in
+`ScadaLink.ConfigurationDatabase`, both outside this module's edit scope.
+Instead the logical `deploymentId` is embedded in the record's free-form
+`PerSiteStatus` JSON payload (`{ DeploymentId, Sites }`), which is fully within
+this module's control, so the persisted record is correlatable with the
+summary/audit. A follow-up to promote it to a first-class column should be
+filed against Commons/ConfigurationDatabase if a queryable index is needed.
+Regression tests: `DeployToAllSitesAsync_AllPerSiteCommandsShareTheSummaryDeploymentId`,
+`DeployToAllSitesAsync_PartialFailure_ReportsPerSiteMatrix`,
+`RetryForSiteAsync_SiteSucceeds_ReturnsSuccessAndAudits`.
### DeploymentManager-011 — Tests never exercise a successful deployment or lifecycle success path
@@ -536,7 +563,7 @@ which asserts on `IAuditService.LogAsync`. Regression tests:
|--|--|
| Severity | Low |
| Category | Documentation & comments |
-| Status | Open |
+| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentManagerOptions.cs:8-9` |
**Description**
@@ -547,6 +574,9 @@ default and an XML doc, but it is never read anywhere in the codebase
`CommunicationService`). The option misleads readers into thinking it controls
disable/enable/delete timeouts, when setting it has no effect.
+**Verification:** Confirmed against source. A repo-wide grep found exactly one
+occurrence of `LifecycleCommandTimeout` — the declaration itself.
+
**Recommendation**
Remove `LifecycleCommandTimeout`, or actually thread it through to the
@@ -556,7 +586,21 @@ lifecycle command calls (e.g. by creating a linked CTS with this timeout in
**Resolution**
-_Unresolved._
+Resolved 2026-05-16 (commit pending): `LifecycleCommandTimeout` is now actually
+threaded through (the option exists for tuning, so it was wired up rather than
+deleted). `DisableInstanceAsync`/`EnableInstanceAsync`/`DeleteInstanceAsync`
+each create a linked `CancellationTokenSource` with `CancelAfter(
+_options.LifecycleCommandTimeout)` — the same pattern `ArtifactDeploymentService`
+uses for `ArtifactDeploymentTimeoutPerSite` — and pass its token to the
+`CommunicationService` call. Each method now catches the resulting
+`TimeoutException`/`OperationCanceledException`, logs a warning, and returns a
+`Result.Failure` (previously an `AskTimeoutException` from a hung site escaped
+uncaught). The option's XML doc was corrected to describe the real behaviour.
+Regression test:
+`DisableInstanceAsync_SiteUnresponsive_LifecycleCommandTimeoutBoundsTheWait`
+(asserts a 300 ms `LifecycleCommandTimeout` bounds the wait far below the 30 s
+`CommunicationOptions.LifecycleTimeout`; confirmed to fail before the fix —
+the call hung the full 30 s and threw `AskTimeoutException`).
### DeploymentManager-013 — SMTP credentials serialized and broadcast to all sites
@@ -585,9 +629,35 @@ Confirm inter-cluster transport encryption covers artifact commands, ensure
SMTP credentials on site SQLite. Consider encrypting the credential field
within the artifact payload.
+**Verification (2026-05-16):** Re-triaged against source. The DeploymentManager
+side is **clean**: `ArtifactDeploymentService` maps `SmtpConfiguration.Credentials`
+into the artifact (which the design explicitly mandates — SMTP configuration is
+a deployable artifact) and **never logs it** — the three log statements in
+`DeployToAllSitesAsync` only reference `SiteId`, `SiteName`, `DeploymentId`, and
+`ex.Message`, never the credential. There is no defect to fix purely within
+`src/ScadaLink.DeploymentManager`. The finding's remaining recommendations are
+all cross-module and one needs a design decision:
+ - inter-cluster transport TLS — `ScadaLink.Communication` /
+ `ScadaLink.ClusterInfrastructure` (Akka remoting + ClusterClient config);
+ - at-rest encryption of the credential on site SQLite — `ScadaLink.SiteRuntime`
+ artifact store;
+ - encrypting the credential field inside the artifact payload — needs the
+ `SmtpConfigurationArtifact` shape in `ScadaLink.Commons` plus cooperating
+ producer (DeploymentManager) and consumer (SiteRuntime) changes, and a
+ **key-management design decision** (where the encryption key lives, how it
+ is distributed to sites) that cannot be made unilaterally here.
+
+**Status: Open — flagged.** No purely-DeploymentManager fix exists; the work
+crosses Communication / SiteRuntime / Commons and requires a key-management
+design decision. Severity confirmed Low: with TLS-protected inter-cluster
+transport (a separate, assumed-in-place control) and no logging leak, this is a
+hardening item, not an active leak.
+
**Resolution**
-_Unresolved._
+_Unresolved — see Verification above. Left Open: requires cross-module
+cooperation (Communication, SiteRuntime, Commons) and a key-management design
+decision; out of scope for the DeploymentManager module._
### DeploymentManager-014 — Dead `CreateCommand` helper in artifact tests
@@ -595,7 +665,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Testing coverage |
-| Status | Open |
+| Status | Resolved |
| Location | `tests/ScadaLink.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs:86-90` |
**Description**
@@ -606,6 +676,10 @@ multi-site artifact deployment) was never written — coverage of
`DeployToAllSitesAsync` is limited to the no-sites failure case, and
`RetryForSiteAsync` and `BuildDeployArtifactsCommandAsync` have no tests at all.
+**Verification:** Confirmed against source. The `CreateCommand()` helper had no
+callers, and `DeployToAllSitesAsync`/`RetryForSiteAsync` only had the no-sites
+failure case.
+
**Recommendation**
Either remove the unused helper or, preferably, write the missing tests for
@@ -614,4 +688,13 @@ Either remove the unused helper or, preferably, write the missing tests for
**Resolution**
-_Unresolved._
+Resolved 2026-05-16 (commit pending): took the recommendation's preferred
+option — removed the dead `CreateCommand()` helper and wrote the missing
+coverage instead. `ArtifactDeploymentServiceTests` now extends `TestKit` and
+uses a stand-in `ArtifactProbeActor` (records the `DeployArtifactsCommand`s it
+receives, replies success or, for a configured failure set, failure) so
+`DeployToAllSitesAsync` and `RetryForSiteAsync` are exercised end-to-end past
+the communication boundary. New tests:
+`DeployToAllSitesAsync_AllPerSiteCommandsShareTheSummaryDeploymentId` (also
+covers DeploymentManager-010), `DeployToAllSitesAsync_PartialFailure_ReportsPerSiteMatrix`
+(per-site success/failure matrix), `RetryForSiteAsync_SiteSucceeds_ReturnsSuccessAndAudits`.
diff --git a/src/ScadaLink.DeploymentManager/ArtifactDeploymentService.cs b/src/ScadaLink.DeploymentManager/ArtifactDeploymentService.cs
index b622ce7..92ad5bb 100644
--- a/src/ScadaLink.DeploymentManager/ArtifactDeploymentService.cs
+++ b/src/ScadaLink.DeploymentManager/ArtifactDeploymentService.cs
@@ -58,9 +58,17 @@ public class ArtifactDeploymentService
/// Collects all artifact types from repositories and builds a
/// scoped to a specific site's data connections.
///
+ /// The DB id of the site whose data connections are collected.
+ ///
+ /// DeploymentManager-010: the logical deployment id for this artifact deployment. All per-site
+ /// commands of one call share this id so the audit log,
+ /// UI summary, and persisted record correlate. When null a fresh id is minted (used by
+ /// single-site retries).
+ ///
public async Task BuildDeployArtifactsCommandAsync(
int siteId,
- CancellationToken cancellationToken = default)
+ CancellationToken cancellationToken = default,
+ string? deploymentId = null)
{
var sharedScripts = await _templateRepo.GetAllSharedScriptsAsync(cancellationToken);
var externalSystems = await _externalSystemRepo.GetAllExternalSystemsAsync(cancellationToken);
@@ -111,7 +119,7 @@ public class ArtifactDeploymentService
smtp.Credentials, null, smtp.TlsMode)).ToList();
return new DeployArtifactsCommand(
- Guid.NewGuid().ToString("N"),
+ deploymentId ?? Guid.NewGuid().ToString("N"),
scriptArtifacts,
externalSystemArtifacts,
dbConnectionArtifacts,
@@ -136,11 +144,15 @@ public class ArtifactDeploymentService
var deploymentId = Guid.NewGuid().ToString("N");
var perSiteResults = new Dictionary();
- // Build per-site commands sequentially (DbContext is not thread-safe)
+ // Build per-site commands sequentially (DbContext is not thread-safe).
+ // DeploymentManager-010: every per-site command carries the SAME logical
+ // deploymentId, so the per-site commands, audit log, persisted record,
+ // and UI summary all reference one id instead of N+1 unrelated GUIDs.
var siteCommands = new Dictionary();
foreach (var site in sites)
{
- siteCommands[site.Id] = await BuildDeployArtifactsCommandAsync(site.Id, cancellationToken);
+ siteCommands[site.Id] = await BuildDeployArtifactsCommandAsync(
+ site.Id, cancellationToken, deploymentId);
}
// Deploy to each site in parallel with per-site timeout
@@ -190,11 +202,20 @@ public class ArtifactDeploymentService
perSiteResults[result.SiteId] = result;
}
- // Persist the system artifact deployment record
+ // Persist the system artifact deployment record.
+ // DeploymentManager-010: SystemArtifactDeploymentRecord has no dedicated
+ // DeploymentId column (adding one is a Commons/ConfigurationDatabase
+ // schema change outside this module). The logical deploymentId is
+ // embedded in the PerSiteStatus payload so the persisted record can be
+ // correlated with the audit log and UI summary that report the same id.
var record = new SystemArtifactDeploymentRecord("Artifacts", user)
{
DeployedAt = DateTimeOffset.UtcNow,
- PerSiteStatus = JsonSerializer.Serialize(perSiteResults)
+ PerSiteStatus = JsonSerializer.Serialize(new
+ {
+ DeploymentId = deploymentId,
+ Sites = perSiteResults
+ })
};
await _deploymentRepo.AddSystemArtifactDeploymentAsync(record, cancellationToken);
await _deploymentRepo.SaveChangesAsync(cancellationToken);
diff --git a/src/ScadaLink.DeploymentManager/DeploymentManagerOptions.cs b/src/ScadaLink.DeploymentManager/DeploymentManagerOptions.cs
index 61cc382..3d607d4 100644
--- a/src/ScadaLink.DeploymentManager/DeploymentManagerOptions.cs
+++ b/src/ScadaLink.DeploymentManager/DeploymentManagerOptions.cs
@@ -5,7 +5,11 @@ namespace ScadaLink.DeploymentManager;
///
public class DeploymentManagerOptions
{
- /// Timeout for lifecycle commands sent to sites (disable, enable, delete).
+ ///
+ /// WP-6: Timeout for a lifecycle command round-trip (disable, enable, delete).
+ /// Applied as a linked-CTS deadline in DeploymentService so a hung or
+ /// unreachable site does not hold the per-instance operation lock indefinitely.
+ ///
public TimeSpan LifecycleCommandTimeout { get; set; } = TimeSpan.FromSeconds(30);
/// WP-7: Timeout per site for system-wide artifact deployment.
diff --git a/src/ScadaLink.DeploymentManager/DeploymentService.cs b/src/ScadaLink.DeploymentManager/DeploymentService.cs
index 6f3ad50..46180c9 100644
--- a/src/ScadaLink.DeploymentManager/DeploymentService.cs
+++ b/src/ScadaLink.DeploymentManager/DeploymentService.cs
@@ -302,7 +302,21 @@ public class DeploymentService
var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
var command = new DisableInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);
- var response = await _communicationService.DisableInstanceAsync(siteId, command, cancellationToken);
+ // WP-6: bound the round-trip with the configured lifecycle timeout so a
+ // hung/unreachable site does not block the operation lock indefinitely.
+ InstanceLifecycleResponse response;
+ try
+ {
+ using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
+ cts.CancelAfter(_options.LifecycleCommandTimeout);
+ response = await _communicationService.DisableInstanceAsync(siteId, command, cts.Token);
+ }
+ catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
+ {
+ _logger.LogWarning(ex, "Disable of instance {Instance} timed out", instance.UniqueName);
+ return Result.Failure(
+ $"Disable failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
+ }
if (response.Success)
{
@@ -343,7 +357,20 @@ public class DeploymentService
var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
var command = new EnableInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);
- var response = await _communicationService.EnableInstanceAsync(siteId, command, cancellationToken);
+ // WP-6: bound the round-trip with the configured lifecycle timeout.
+ InstanceLifecycleResponse response;
+ try
+ {
+ using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
+ cts.CancelAfter(_options.LifecycleCommandTimeout);
+ response = await _communicationService.EnableInstanceAsync(siteId, command, cts.Token);
+ }
+ catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
+ {
+ _logger.LogWarning(ex, "Enable of instance {Instance} timed out", instance.UniqueName);
+ return Result.Failure(
+ $"Enable failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
+ }
if (response.Success)
{
@@ -365,7 +392,9 @@ public class DeploymentService
/// WP-6: Delete an instance. Stops the site actor, removes site config, and
/// removes the central instance record (deployment history, snapshot,
/// overrides, and connection bindings go with it). S&F NOT cleared.
- /// Delete fails if site unreachable (30s timeout via CommunicationOptions).
+ /// Delete fails if the site is unreachable within
+ /// CommunicationOptions.LifecycleTimeout (applied inside
+ /// ).
///
public async Task> DeleteInstanceAsync(
int instanceId,
@@ -387,7 +416,20 @@ public class DeploymentService
var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
var command = new DeleteInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);
- var response = await _communicationService.DeleteInstanceAsync(siteId, command, cancellationToken);
+ // WP-6: bound the round-trip with the configured lifecycle timeout.
+ InstanceLifecycleResponse response;
+ try
+ {
+ using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
+ cts.CancelAfter(_options.LifecycleCommandTimeout);
+ response = await _communicationService.DeleteInstanceAsync(siteId, command, cts.Token);
+ }
+ catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
+ {
+ _logger.LogWarning(ex, "Delete of instance {Instance} timed out", instance.UniqueName);
+ return Result.Failure(
+ $"Delete failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
+ }
if (response.Success)
{
diff --git a/tests/ScadaLink.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs b/tests/ScadaLink.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs
index 52b083f..3c7e94b 100644
--- a/tests/ScadaLink.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs
+++ b/tests/ScadaLink.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs
@@ -1,6 +1,11 @@
+using System.Collections.Concurrent;
+using System.Text.Json;
+using Akka.Actor;
+using Akka.TestKit.Xunit2;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using NSubstitute;
+using ScadaLink.Commons.Entities.Deployment;
using ScadaLink.Commons.Entities.Sites;
using ScadaLink.Commons.Interfaces.Repositories;
using ScadaLink.Commons.Interfaces.Services;
@@ -12,7 +17,7 @@ namespace ScadaLink.DeploymentManager.Tests;
///
/// WP-7: Tests for system-wide artifact deployment.
///
-public class ArtifactDeploymentServiceTests
+public class ArtifactDeploymentServiceTests : TestKit
{
private readonly ISiteRepository _siteRepo;
private readonly IDeploymentManagerRepository _deploymentRepo;
@@ -70,6 +75,86 @@ public class ArtifactDeploymentServiceTests
Assert.Equal(3, summary.SiteResults.Count);
}
+ // ── DeploymentManager-010: one logical deployment id across all per-site commands ──
+
+ [Fact]
+ public async Task DeployToAllSitesAsync_AllPerSiteCommandsShareTheSummaryDeploymentId()
+ {
+ // DeploymentManager-010: previously each per-site DeployArtifactsCommand
+ // minted its own GUID, so one logical deployment produced N+1 unrelated
+ // ids. Every per-site command must now carry the SAME id, equal to the
+ // id reported in the summary and audit log.
+ var sites = new List
+ {
+ new("Site One", "site-1") { Id = 1 },
+ new("Site Two", "site-2") { Id = 2 }
+ };
+ _siteRepo.GetAllSitesAsync(Arg.Any()).Returns(sites);
+ var probe = Sys.ActorOf(Props.Create(() => new ArtifactProbeActor()));
+ var service = CreateServiceWithCommActor(probe);
+
+ var result = await service.DeployToAllSitesAsync("admin");
+
+ Assert.True(result.IsSuccess);
+ var commands = ArtifactProbeActor.Received;
+ Assert.Equal(2, commands.Count);
+
+ // All per-site commands carry one shared id, equal to the summary id.
+ var distinctIds = commands.Select(c => c.DeploymentId).Distinct().ToList();
+ Assert.Single(distinctIds);
+ Assert.Equal(result.Value.DeploymentId, distinctIds[0]);
+
+ // The persisted record embeds the same logical deployment id.
+ await _deploymentRepo.Received().AddSystemArtifactDeploymentAsync(
+ Arg.Do(r =>
+ {
+ using var doc = JsonDocument.Parse(r.PerSiteStatus!);
+ Assert.Equal(result.Value.DeploymentId,
+ doc.RootElement.GetProperty("DeploymentId").GetString());
+ }),
+ Arg.Any());
+ }
+
+ // ── DeploymentManager-014: real per-site success/failure coverage ──
+
+ [Fact]
+ public async Task DeployToAllSitesAsync_PartialFailure_ReportsPerSiteMatrix()
+ {
+ // Site one succeeds, site two fails -> the summary counts must reflect
+ // the per-site matrix.
+ var sites = new List
+ {
+ new("Site One", "ok-site") { Id = 1 },
+ new("Site Two", "fail-site") { Id = 2 }
+ };
+ _siteRepo.GetAllSitesAsync(Arg.Any()).Returns(sites);
+ var probe = Sys.ActorOf(Props.Create(() => new ArtifactProbeActor("fail-site")));
+ var service = CreateServiceWithCommActor(probe);
+
+ var result = await service.DeployToAllSitesAsync("admin");
+
+ Assert.True(result.IsSuccess);
+ Assert.Equal(1, result.Value.SuccessCount);
+ Assert.Equal(1, result.Value.FailureCount);
+ Assert.Contains(result.Value.SiteResults, r => r.SiteId == "ok-site" && r.Success);
+ Assert.Contains(result.Value.SiteResults, r => r.SiteId == "fail-site" && !r.Success);
+ }
+
+ [Fact]
+ public async Task RetryForSiteAsync_SiteSucceeds_ReturnsSuccessAndAudits()
+ {
+ var probe = Sys.ActorOf(Props.Create(() => new ArtifactProbeActor()));
+ var service = CreateServiceWithCommActor(probe);
+
+ var result = await service.RetryForSiteAsync(1, "retry-site", "admin");
+
+ Assert.True(result.IsSuccess);
+ Assert.Equal("retry-site", result.Value.SiteId);
+ await _audit.Received().LogAsync(
+ "admin", "RetryArtifactDeployment", "SystemArtifact",
+ Arg.Any(), "retry-site", Arg.Any