fix(deployment-manager): resolve DeploymentManager-009,010,012,014 — shared deployment ID, lifecycle-timeout enforcement, doc/test cleanup; DeploymentManager-013 flagged

This commit is contained in:
Joseph Doherty
2026-05-16 22:14:23 -04:00
parent ff4a4bdeb7
commit e9ee4e3ea5
6 changed files with 355 additions and 25 deletions

View File

@@ -1,6 +1,11 @@
using System.Collections.Concurrent;
using System.Text.Json;
using Akka.Actor;
using Akka.TestKit.Xunit2;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using NSubstitute;
using ScadaLink.Commons.Entities.Deployment;
using ScadaLink.Commons.Entities.Sites;
using ScadaLink.Commons.Interfaces.Repositories;
using ScadaLink.Commons.Interfaces.Services;
@@ -12,7 +17,7 @@ namespace ScadaLink.DeploymentManager.Tests;
/// <summary>
/// WP-7: Tests for system-wide artifact deployment.
/// </summary>
public class ArtifactDeploymentServiceTests
public class ArtifactDeploymentServiceTests : TestKit
{
private readonly ISiteRepository _siteRepo;
private readonly IDeploymentManagerRepository _deploymentRepo;
@@ -70,6 +75,86 @@ public class ArtifactDeploymentServiceTests
Assert.Equal(3, summary.SiteResults.Count);
}
// ── DeploymentManager-010: one logical deployment id across all per-site commands ──
/// <summary>
/// DeploymentManager-010: every per-site <see cref="DeployArtifactsCommand"/> sent for one
/// logical deployment must carry the same deployment id, and that id must match the one
/// reported in the summary and embedded in the persisted record.
/// </summary>
[Fact]
public async Task DeployToAllSitesAsync_AllPerSiteCommandsShareTheSummaryDeploymentId()
{
    // DeploymentManager-010: previously each per-site DeployArtifactsCommand
    // minted its own GUID, so one logical deployment produced N+1 unrelated
    // ids. Every per-site command must now carry the SAME id, equal to the
    // id reported in the summary and audit log.
    var sites = new List<Site>
    {
        new("Site One", "site-1") { Id = 1 },
        new("Site Two", "site-2") { Id = 2 }
    };
    _siteRepo.GetAllSitesAsync(Arg.Any<CancellationToken>()).Returns(sites);

    // Capture the persisted record at the moment it is written. Arg.Do must be
    // registered BEFORE the call under test: when placed inside Received() it
    // only acts as an "any argument" matcher and its callback is not replayed
    // against calls that already happened, so assertions inside it never run.
    SystemArtifactDeploymentRecord? persistedRecord = null;
    _ = _deploymentRepo.AddSystemArtifactDeploymentAsync(
        Arg.Do<SystemArtifactDeploymentRecord>(r => persistedRecord = r),
        Arg.Any<CancellationToken>());

    var probe = Sys.ActorOf(Props.Create(() => new ArtifactProbeActor()));
    var service = CreateServiceWithCommActor(probe);

    var result = await service.DeployToAllSitesAsync("admin");

    Assert.True(result.IsSuccess);
    var commands = ArtifactProbeActor.Received;
    Assert.Equal(2, commands.Count);

    // All per-site commands carry one shared id, equal to the summary id.
    var distinctIds = commands.Select(c => c.DeploymentId).Distinct().ToList();
    Assert.Single(distinctIds);
    Assert.Equal(result.Value.DeploymentId, distinctIds[0]);

    // The persisted record embeds the same logical deployment id.
    await _deploymentRepo.Received().AddSystemArtifactDeploymentAsync(
        Arg.Any<SystemArtifactDeploymentRecord>(),
        Arg.Any<CancellationToken>());
    Assert.NotNull(persistedRecord);
    using var doc = JsonDocument.Parse(persistedRecord!.PerSiteStatus!);
    Assert.Equal(result.Value.DeploymentId,
        doc.RootElement.GetProperty("DeploymentId").GetString());
}
// ── DeploymentManager-014: real per-site success/failure coverage ──
/// <summary>
/// DeploymentManager-014: with one accepting site and one rejecting site, the summary
/// must report one success and one failure and attribute each to the right site.
/// </summary>
[Fact]
public async Task DeployToAllSitesAsync_PartialFailure_ReportsPerSiteMatrix()
{
    // Arrange: "ok-site" accepts the artifacts, "fail-site" is configured to reject them.
    _siteRepo.GetAllSitesAsync(Arg.Any<CancellationToken>()).Returns(
        new List<Site>
        {
            new("Site One", "ok-site") { Id = 1 },
            new("Site Two", "fail-site") { Id = 2 }
        });
    var commActor = Sys.ActorOf(Props.Create(() => new ArtifactProbeActor("fail-site")));
    var sut = CreateServiceWithCommActor(commActor);

    // Act
    var outcome = await sut.DeployToAllSitesAsync("admin");

    // Assert: the overall call succeeds even though one site failed, and the
    // counts plus per-site rows mirror the success/failure matrix.
    Assert.True(outcome.IsSuccess);
    var summary = outcome.Value;
    Assert.Equal(1, summary.SuccessCount);
    Assert.Equal(1, summary.FailureCount);
    Assert.Contains(summary.SiteResults, row => row.SiteId == "ok-site" && row.Success);
    Assert.Contains(summary.SiteResults, row => row.SiteId == "fail-site" && !row.Success);
}
/// <summary>
/// A retry against a site that accepts the artifacts returns a successful result for
/// that site and writes a "RetryArtifactDeployment" audit entry.
/// </summary>
[Fact]
public async Task RetryForSiteAsync_SiteSucceeds_ReturnsSuccessAndAudits()
{
    // Arrange: probe with no failing sites, so every command is acknowledged as success.
    var commActor = Sys.ActorOf(Props.Create(() => new ArtifactProbeActor()));
    var sut = CreateServiceWithCommActor(commActor);

    // Act
    var outcome = await sut.RetryForSiteAsync(1, "retry-site", "admin");

    // Assert
    Assert.True(outcome.IsSuccess);
    Assert.Equal("retry-site", outcome.Value.SiteId);
    await _audit.Received().LogAsync(
        "admin",
        "RetryArtifactDeployment",
        "SystemArtifact",
        Arg.Any<string>(),
        "retry-site",
        Arg.Any<object>(),
        Arg.Any<CancellationToken>());
}
private ArtifactDeploymentService CreateService()
{
var comms = new CommunicationService(
@@ -83,9 +168,51 @@ public class ArtifactDeploymentServiceTests
NullLogger<ArtifactDeploymentService>.Instance);
}
private static DeployArtifactsCommand CreateCommand()
/// <summary>
/// Builds an <see cref="ArtifactDeploymentService"/> whose <see cref="CommunicationService"/>
/// sends to the supplied actor, so a test can control the per-site replies.
/// Both the communication-layer and the per-site artifact deployment timeouts are set to 5 seconds.
/// </summary>
/// <param name="commActor">Actor installed as the communication actor of the service under test.</param>
/// <returns>A service wired with the fixture's repository and audit fields.</returns>
private ArtifactDeploymentService CreateServiceWithCommActor(IActorRef commActor)
{
// NOTE(review): the next two lines look like the removed body of the former
// CreateCommand() helper, interleaved here by the diff rendering — confirm they
// are not part of the actual file.
return new DeployArtifactsCommand(
"dep1", null, null, null, null, null, null, DateTimeOffset.UtcNow);
// Communication layer with a 5 second artifact deployment timeout.
var comms = new CommunicationService(
Options.Create(new CommunicationOptions
{
ArtifactDeploymentTimeout = TimeSpan.FromSeconds(5)
}),
NullLogger<CommunicationService>.Instance);
// Route outgoing messages to the caller-supplied actor.
comms.SetCommunicationActor(commActor);
return new ArtifactDeploymentService(
_siteRepo, _deploymentRepo, _templateRepo, _externalSystemRepo, _notificationRepo,
comms, _audit,
Options.Create(new DeploymentManagerOptions
{
ArtifactDeploymentTimeoutPerSite = TimeSpan.FromSeconds(5)
}),
NullLogger<ArtifactDeploymentService>.Instance);
}
/// <summary>
/// Test double for the central communication actor during artifact deployment.
/// Each <see cref="DeployArtifactsCommand"/> wrapped in a <see cref="SiteEnvelope"/> is
/// recorded in <see cref="Received"/> and answered with an
/// <see cref="ArtifactDeploymentResponse"/>: success unless the envelope's site id was
/// passed to the constructor as a failing site.
/// </summary>
private class ArtifactProbeActor : ReceiveActor
{
    /// <summary>Commands recorded since the most recent probe instance was constructed.</summary>
    public static readonly ConcurrentBag<DeployArtifactsCommand> Received = new();

    private readonly HashSet<string> _rejectingSites;

    /// <param name="failingSites">Site ids whose commands are answered with a failure response.</param>
    public ArtifactProbeActor(params string[] failingSites)
    {
        // The bag is static, so each new probe starts from an empty record.
        Received.Clear();
        _rejectingSites = new HashSet<string>(failingSites);
        Receive<SiteEnvelope>(envelope => OnEnvelope(envelope));
    }

    /// <summary>Records a deploy command and replies to the sender; other payloads are ignored.</summary>
    private void OnEnvelope(SiteEnvelope envelope)
    {
        if (envelope.Message is not DeployArtifactsCommand command)
        {
            return;
        }

        Received.Add(command);

        var accepted = !_rejectingSites.Contains(envelope.SiteId);
        var reply = new ArtifactDeploymentResponse(
            command.DeploymentId,
            envelope.SiteId,
            accepted,
            accepted ? null : "site rejected artifacts",
            DateTimeOffset.UtcNow);
        Sender.Tell(reply);
    }
}
}

View File

@@ -763,6 +763,59 @@ public class DeploymentServiceTests : TestKit
Assert.Equal(1, ReconcileProbeActor.DeployCount);
}
// ── DeploymentManager-012: LifecycleCommandTimeout must actually bound lifecycle commands ──
/// <summary>
/// DeploymentManager-012: when the site never answers a disable command, the short
/// <c>LifecycleCommandTimeout</c> from <see cref="DeploymentManagerOptions"/> must end the
/// wait long before the 30 second communication-layer <c>LifecycleTimeout</c> would.
/// </summary>
[Fact]
public async Task DisableInstanceAsync_SiteUnresponsive_LifecycleCommandTimeoutBoundsTheWait()
{
    // Arrange: an enabled instance whose site swallows every message, so no
    // reply to the disable command ever arrives.
    var stuckInstance = new Instance("StuckInst") { Id = 60, SiteId = 1, State = InstanceState.Enabled };
    _repo.GetInstanceByIdAsync(60, Arg.Any<CancellationToken>()).Returns(stuckInstance);
    var silentSite = Sys.ActorOf(Props.Create(() => new SilentProbeActor()));

    // The communication layer is deliberately slow (30s): if LifecycleCommandTimeout
    // were dead code, the call below would take this long to give up.
    var communicationOptions = new CommunicationOptions
    {
        LifecycleTimeout = TimeSpan.FromSeconds(30)
    };
    var comms = new CommunicationService(
        Options.Create(communicationOptions),
        NullLogger<CommunicationService>.Instance);
    comms.SetCommunicationActor(silentSite);

    var managerOptions = new DeploymentManagerOptions
    {
        OperationLockTimeout = TimeSpan.FromSeconds(5),
        LifecycleCommandTimeout = TimeSpan.FromMilliseconds(300)
    };
    var service = new DeploymentService(
        _repo,
        Substitute.For<ISiteRepository>(),
        _pipeline,
        comms,
        _lockManager,
        _audit,
        new DiffService(),
        Options.Create(managerOptions),
        NullLogger<DeploymentService>.Instance);

    // Act: time the call end to end.
    var sw = System.Diagnostics.Stopwatch.StartNew();
    var result = await service.DisableInstanceAsync(60, "admin");
    sw.Stop();

    // Assert: the call fails, and does so well inside the 30s communication-layer
    // timeout. The 10s ceiling leaves generous slack over the 300ms setting.
    Assert.True(result.IsFailure);
    var ceiling = TimeSpan.FromSeconds(10);
    Assert.True(sw.Elapsed < ceiling,
        $"Lifecycle command was not bounded by LifecycleCommandTimeout (took {sw.Elapsed}).");
}
/// <summary>
/// Test double for an unresponsive site: accepts any message and never replies.
/// </summary>
private class SilentProbeActor : ReceiveActor
{
    public SilentProbeActor()
    {
        // Swallow every message without answering the sender.
        ReceiveAny(_ => { });
    }
}
// ── DeploymentManager-003: post-success persistence must commit the Success status ──
[Fact]