fix(deployment-manager): resolve DeploymentManager-009,010,012,014 — shared deployment ID, lifecycle-timeout enforcement, doc/test cleanup; DeploymentManager-013 flagged

This commit is contained in:
Joseph Doherty
2026-05-16 22:14:23 -04:00
parent ff4a4bdeb7
commit e9ee4e3ea5
6 changed files with 355 additions and 25 deletions

View File

@@ -58,9 +58,17 @@ public class ArtifactDeploymentService
/// Collects all artifact types from repositories and builds a <see cref="DeployArtifactsCommand"/>
/// scoped to a specific site's data connections.
/// </summary>
/// <param name="siteId">The DB id of the site whose data connections are collected.</param>
/// <param name="deploymentId">
/// DeploymentManager-010: the logical deployment id for this artifact deployment. All per-site
/// commands of one <see cref="DeployToAllSitesAsync"/> call share this id so the audit log,
/// UI summary, and persisted record correlate. When <c>null</c> a fresh id is minted (used by
/// single-site retries).
/// </param>
public async Task<DeployArtifactsCommand> BuildDeployArtifactsCommandAsync(
int siteId,
CancellationToken cancellationToken = default)
CancellationToken cancellationToken = default,
string? deploymentId = null)
{
var sharedScripts = await _templateRepo.GetAllSharedScriptsAsync(cancellationToken);
var externalSystems = await _externalSystemRepo.GetAllExternalSystemsAsync(cancellationToken);
@@ -111,7 +119,7 @@ public class ArtifactDeploymentService
smtp.Credentials, null, smtp.TlsMode)).ToList();
return new DeployArtifactsCommand(
Guid.NewGuid().ToString("N"),
deploymentId ?? Guid.NewGuid().ToString("N"),
scriptArtifacts,
externalSystemArtifacts,
dbConnectionArtifacts,
@@ -136,11 +144,15 @@ public class ArtifactDeploymentService
var deploymentId = Guid.NewGuid().ToString("N");
var perSiteResults = new Dictionary<string, SiteArtifactResult>();
// Build per-site commands sequentially (DbContext is not thread-safe)
// Build per-site commands sequentially (DbContext is not thread-safe).
// DeploymentManager-010: every per-site command carries the SAME logical
// deploymentId, so the per-site commands, audit log, persisted record,
// and UI summary all reference one id instead of N+1 unrelated GUIDs.
var siteCommands = new Dictionary<int, DeployArtifactsCommand>();
foreach (var site in sites)
{
siteCommands[site.Id] = await BuildDeployArtifactsCommandAsync(site.Id, cancellationToken);
siteCommands[site.Id] = await BuildDeployArtifactsCommandAsync(
site.Id, cancellationToken, deploymentId);
}
// Deploy to each site in parallel with per-site timeout
@@ -190,11 +202,20 @@ public class ArtifactDeploymentService
perSiteResults[result.SiteId] = result;
}
// Persist the system artifact deployment record
// Persist the system artifact deployment record.
// DeploymentManager-010: SystemArtifactDeploymentRecord has no dedicated
// DeploymentId column (adding one is a Commons/ConfigurationDatabase
// schema change outside this module). The logical deploymentId is
// embedded in the PerSiteStatus payload so the persisted record can be
// correlated with the audit log and UI summary that report the same id.
var record = new SystemArtifactDeploymentRecord("Artifacts", user)
{
DeployedAt = DateTimeOffset.UtcNow,
PerSiteStatus = JsonSerializer.Serialize(perSiteResults)
PerSiteStatus = JsonSerializer.Serialize(new
{
DeploymentId = deploymentId,
Sites = perSiteResults
})
};
await _deploymentRepo.AddSystemArtifactDeploymentAsync(record, cancellationToken);
await _deploymentRepo.SaveChangesAsync(cancellationToken);

View File

@@ -5,7 +5,11 @@ namespace ScadaLink.DeploymentManager;
/// </summary>
public class DeploymentManagerOptions
{
/// <summary>Timeout for lifecycle commands sent to sites (disable, enable, delete).</summary>
/// <summary>
/// WP-6: Timeout for a lifecycle command round-trip (disable, enable, delete).
/// Applied as a linked-CTS deadline in <c>DeploymentService</c> so a hung or
/// unreachable site does not hold the per-instance operation lock indefinitely.
/// </summary>
public TimeSpan LifecycleCommandTimeout { get; set; } = TimeSpan.FromSeconds(30);
/// <summary>WP-7: Timeout per site for system-wide artifact deployment.</summary>

View File

@@ -302,7 +302,21 @@ public class DeploymentService
var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
var command = new DisableInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);
var response = await _communicationService.DisableInstanceAsync(siteId, command, cancellationToken);
// WP-6: bound the round-trip with the configured lifecycle timeout so a
// hung/unreachable site does not block the operation lock indefinitely.
InstanceLifecycleResponse response;
try
{
using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
cts.CancelAfter(_options.LifecycleCommandTimeout);
response = await _communicationService.DisableInstanceAsync(siteId, command, cts.Token);
}
catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
{
_logger.LogWarning(ex, "Disable of instance {Instance} timed out", instance.UniqueName);
return Result<InstanceLifecycleResponse>.Failure(
$"Disable failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
}
if (response.Success)
{
@@ -343,7 +357,20 @@ public class DeploymentService
var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
var command = new EnableInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);
var response = await _communicationService.EnableInstanceAsync(siteId, command, cancellationToken);
// WP-6: bound the round-trip with the configured lifecycle timeout.
InstanceLifecycleResponse response;
try
{
using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
cts.CancelAfter(_options.LifecycleCommandTimeout);
response = await _communicationService.EnableInstanceAsync(siteId, command, cts.Token);
}
catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
{
_logger.LogWarning(ex, "Enable of instance {Instance} timed out", instance.UniqueName);
return Result<InstanceLifecycleResponse>.Failure(
$"Enable failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
}
if (response.Success)
{
@@ -365,7 +392,9 @@ public class DeploymentService
/// WP-6: Delete an instance. Stops the site actor, removes site config, and
/// removes the central instance record (deployment history, snapshot,
/// overrides, and connection bindings go with it). S&amp;F NOT cleared.
/// Delete fails if site unreachable (30s timeout via CommunicationOptions).
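/// Delete fails if the site does not respond within
/// <c>DeploymentManagerOptions.LifecycleCommandTimeout</c> (enforced in this method
/// via a linked cancellation token), in addition to
/// <c>CommunicationOptions.LifecycleTimeout</c> (applied inside
/// <see cref="CommunicationService.DeleteInstanceAsync"/>).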
/// </summary>
public async Task<Result<InstanceLifecycleResponse>> DeleteInstanceAsync(
int instanceId,
@@ -387,7 +416,20 @@ public class DeploymentService
var siteId = await ResolveSiteIdentifierAsync(instance.SiteId, cancellationToken);
var command = new DeleteInstanceCommand(commandId, instance.UniqueName, DateTimeOffset.UtcNow);
var response = await _communicationService.DeleteInstanceAsync(siteId, command, cancellationToken);
// WP-6: bound the round-trip with the configured lifecycle timeout.
InstanceLifecycleResponse response;
try
{
using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
cts.CancelAfter(_options.LifecycleCommandTimeout);
response = await _communicationService.DeleteInstanceAsync(siteId, command, cts.Token);
}
catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
{
_logger.LogWarning(ex, "Delete of instance {Instance} timed out", instance.UniqueName);
return Result<InstanceLifecycleResponse>.Failure(
$"Delete failed: the site did not respond within {_options.LifecycleCommandTimeout}.");
}
if (response.Success)
{