9ab1c00265
Fixes the 8 findings from the 2026-06-24 re-review (commit c42bb485), with a
regression test per Medium finding:
- DataConnectionLayer-029 (Med): HandleAlarmSubscribeCompleted now mirrors the
tag-path re-check — if a feed is already stored for the source, release the
redundant just-created subscription instead of overwriting + leaking the first
one (the double-subscribe window DCL-023 reopened). +regression test.
- InboundAPI-031 (Med): remove WaitForAttribute's local 5s grace backstop (tighter
than the CommunicationService Ask's timeout+IntegrationTimeout round-trip budget,
so a slow-but-valid timed-out 'false' got cancelled into a 500). Link only the
client-abort + explicit caller tokens; the lower layer owns the backstop. +test.
- SiteRuntime-032 (Med): derive the deployed count from an authoritative set of
deployed config names (HashSet) instead of a map-presence-gated int, so deleting
a DISABLED instance decrements correctly (SiteRuntime-029's gate leaked it).
+deploy->disable->delete regression test.
- StoreAndForward-028 (Med): reset _bufferedCount in StopAsync alongside the
register-guard so a same-instance Stop->Start re-seeds from a clean base (no ~2N
gauge double-count). +restart regression test.
- AuditLog-017 (Low): test the OnIngestAsync scope-resolution guard (actor survives,
replies empty, counts the failure) — no longer unpinned.
- CentralUI-037 / ScriptAnalysis-009 / SiteRuntime-033 (Low): doc-comment + spec
fixes (Database-throws in the inbound sandbox; baseReferences param wording;
native-alarm cap return-to-normal + per-condition NativeAlarmDropped eviction).
Targeted suites green: SiteRuntime 5, StoreAndForward 6, InboundAPI 31,
DataConnectionLayer 10, AuditLog 5, ScriptAnalysis 40, CentralUI ScriptAnalysis 52.
315 lines
16 KiB
C#
315 lines
16 KiB
C#
using Akka.Actor;
|
|
using Akka.TestKit.Xunit2;
|
|
using Microsoft.Extensions.Logging.Abstractions;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Lifecycle;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening;
|
|
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors;
|
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
|
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts;
|
|
using System.Text.Json;
|
|
|
|
namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.Actors;
|
|
|
|
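/// <summary>
/// Regression tests for redeploy handling in the Deployment Manager actor.
/// Originally written for SiteRuntime-003: redeployment of an existing instance must
/// wait for the terminating Instance Actor before recreating the child, instead of
/// relying on a fixed 500 ms reschedule that can collide on the child actor name.
/// The class also holds the related SiteRuntime-020, SiteRuntime-029 and
/// SiteRuntime-032 regression tests (rapid redeploys, delete during a pending
/// redeploy, and the deployed-instance count).
/// </summary>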
|
|
public class DeploymentManagerRedeployTests : TestKit, IDisposable
|
|
{
|
|
// Site storage opened on the throwaway database file named by _dbFile.
private readonly SiteStorageService _storage;
|
|
// Script compilation service handed to the shared script library and to the actor under test.
private readonly ScriptCompilationService _compilationService;
|
|
// Shared script library built on _compilationService; passed to the actor under test.
private readonly SharedScriptLibrary _sharedScriptLibrary;
|
|
// Path of the per-test-instance temp database file; deletion is attempted in Dispose.
private readonly string _dbFile;
|
|
|
|
/// <summary>
/// Builds the per-test fixture: a uniquely named temp database file, an initialized
/// <see cref="SiteStorageService"/> on top of it, and the script compilation service
/// plus shared script library that the actor under test is constructed with.
/// </summary>
public DeploymentManagerRedeployTests()
{
    // A fresh GUID in the file name gives every test instance its own database file.
    var fileName = $"dm-redeploy-test-{Guid.NewGuid():N}.db";
    _dbFile = Path.Combine(Path.GetTempPath(), fileName);

    var connectionString = $"Data Source={_dbFile}";
    _storage = new SiteStorageService(connectionString, NullLogger<SiteStorageService>.Instance);

    // A constructor cannot await, so storage initialization is completed synchronously.
    _storage.InitializeAsync().GetAwaiter().GetResult();

    _compilationService = new ScriptCompilationService(NullLogger<ScriptCompilationService>.Instance);
    _sharedScriptLibrary = new SharedScriptLibrary(
        _compilationService,
        NullLogger<SharedScriptLibrary>.Instance);
}
|
|
|
|
/// <summary>
/// Per-test teardown: shuts the TestKit down, then attempts to remove the temp
/// database file created by the constructor.
/// </summary>
void IDisposable.Dispose()
{
    Shutdown();

    try
    {
        File.Delete(_dbFile);
    }
    catch
    {
        // Best-effort cleanup: a leftover temp file must never fail a test.
    }
}
|
|
|
|
/// <summary>
/// Creates a <see cref="DeploymentManagerActor"/> wired to this fixture's storage,
/// compilation service and shared script library, with default
/// <see cref="SiteRuntimeOptions"/> and a null logger.
/// </summary>
/// <param name="healthCollector">
/// Optional health collector passed through to the actor; <c>null</c> when omitted.
/// </param>
/// <returns>The reference of the newly created actor.</returns>
private IActorRef CreateDeploymentManager(ISiteHealthCollector? healthCollector = null)
{
    // Every constructor argument this fixture does not supply is passed as null.
    var props = Props.Create(() => new DeploymentManagerActor(
        _storage,
        _compilationService,
        _sharedScriptLibrary,
        null,
        new SiteRuntimeOptions(),
        NullLogger<DeploymentManagerActor>.Instance,
        null,
        null,
        healthCollector,
        null));

    return ActorOf(props);
}
|
|
|
|
/// <summary>
/// Test double for <see cref="ISiteHealthCollector"/> that remembers only the
/// <c>deployed</c> value of the latest <see cref="SetInstanceCounts"/> call.
/// Every other mutator is a no-op, <see cref="IsActiveNode"/> is always <c>true</c>,
/// and <see cref="CollectReport"/> is not supported.
/// </summary>
private sealed class CountCapturingHealthCollector : ISiteHealthCollector
{
    /// <summary>The <c>deployed</c> argument of the most recent SetInstanceCounts call (0 until then).</summary>
    public int LastDeployedCount { get; private set; }

    /// <summary>Always reports this node as active.</summary>
    public bool IsActiveNode
    {
        get { return true; }
    }

    /// <summary>Captures <paramref name="deployed"/>; the enabled/disabled counts are ignored.</summary>
    public void SetInstanceCounts(int deployed, int enabled, int disabled)
    {
        LastDeployedCount = deployed;
    }

    /// <summary>Not supported by this fake.</summary>
    public SiteHealthReport CollectReport(string siteId)
    {
        throw new NotSupportedException();
    }

    // ---- Error / failure counters: intentionally ignored. ----
    public void IncrementScriptError() { }
    public void IncrementAlarmError() { }
    public void IncrementDeadLetter() { }
    public void IncrementSiteAuditWriteFailures() { }
    public void IncrementAuditRedactionFailure() { }

    // ---- Audit backlog and connection metrics: intentionally ignored. ----
    public void UpdateSiteAuditBacklog(ZB.MOM.WW.ScadaBridge.Commons.Types.SiteAuditBacklogSnapshot snapshot) { }
    public void UpdateConnectionHealth(string connectionName, ConnectionHealth health) { }
    public void RemoveConnection(string connectionName) { }
    public void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved) { }
    public void UpdateConnectionEndpoint(string connectionName, string endpoint) { }
    public void UpdateTagQuality(string connectionName, int good, int bad, int uncertain) { }

    // ---- Store-and-forward, parked-message and node/cluster state: intentionally ignored. ----
    public void SetStoreAndForwardDepths(IReadOnlyDictionary<string, int> depths) { }
    public void SetParkedMessageCount(int count) { }
    public void SetNodeHostname(string hostname) { }
    public void SetClusterNodes(IReadOnlyList<NodeStatus> nodes) { }
    public void SetActiveNode(bool isActive) { }
}
|
|
|
|
/// <summary>
/// Serializes a minimal <see cref="FlattenedConfiguration"/> for the given instance:
/// the instance name plus a single attribute named "TestAttr".
/// </summary>
/// <param name="instanceName">Value written to <c>InstanceUniqueName</c>.</param>
/// <returns>The configuration serialized to JSON with <see cref="JsonSerializer"/>.</returns>
private static string MakeConfigJson(string instanceName)
{
    var testAttribute = new ResolvedAttribute
    {
        CanonicalName = "TestAttr",
        Value = "1",
        DataType = "Int32"
    };

    return JsonSerializer.Serialize(new FlattenedConfiguration
    {
        InstanceUniqueName = instanceName,
        Attributes = [testAttribute]
    });
}
|
|
|
|
/// <summary>
/// Deploys an instance, redeploys it under the same name, and verifies that both
/// deployments report success and that the redeployed instance still answers a
/// disable command.
/// </summary>
[Fact]
public async Task Redeploy_ExistingInstance_SucceedsWithoutNameCollision()
{
    const string instanceName = "RedeployPump";
    const int settleMilliseconds = 500;

    var deploymentManager = CreateDeploymentManager();
    await Task.Delay(settleMilliseconds); // empty startup

    // First deployment of the instance.
    var initialDeploy = new DeployInstanceCommand(
        "dep-1", instanceName, "h1", MakeConfigJson(instanceName), "admin", DateTimeOffset.UtcNow);
    deploymentManager.Tell(initialDeploy);

    var initialReply = ExpectMsg<DeploymentStatusResponse>(TimeSpan.FromSeconds(5));
    Assert.Equal(DeploymentStatus.Success, initialReply.Status);
    await Task.Delay(settleMilliseconds);

    // Second deployment under the same name: the existing actor must be replaced cleanly.
    var redeploy = new DeployInstanceCommand(
        "dep-2", instanceName, "h2", MakeConfigJson(instanceName), "admin", DateTimeOffset.UtcNow);
    deploymentManager.Tell(redeploy);

    var redeployReply = ExpectMsg<DeploymentStatusResponse>(TimeSpan.FromSeconds(10));
    Assert.Equal(DeploymentStatus.Success, redeployReply.Status);

    // A successful disable shows the redeployed instance is operable (no orphaned/broken actor).
    deploymentManager.Tell(new DisableInstanceCommand("cmd-1", instanceName, DateTimeOffset.UtcNow));

    var disableReply = ExpectMsg<InstanceLifecycleResponse>(TimeSpan.FromSeconds(5));
    Assert.True(disableReply.Success);
}
|
|
|
|
/// <summary>
/// Regression test for SiteRuntime-020: two redeploys sent back-to-back while the
/// original instance actor is still terminating. The earlier redeploy must be told
/// it was superseded, the later one must succeed, and the instance must remain operable.
/// </summary>
[Fact]
public async Task SR020_ThreeRapidDeploys_DoNotThrowInvalidActorNameException_LatestWins()
{
    // Background (SiteRuntime-020): pending redeploys used to be tracked only by
    // IActorRef (_pendingRedeploys), with no name-keyed shadow. A further
    // DeployInstanceCommand arriving WHILE the first redeploy's predecessor was
    // still terminating therefore saw _instanceActors.TryGetValue==false and fell
    // through to ApplyDeployment → CreateInstanceActor → Context.ActorOf. That threw
    // InvalidActorNameException, because the child name stays registered until
    // Terminated fires. The supervisor's Stop directive then silently dropped the
    // deploy, so the deployer waited forever and the persistence Task.Run was left
    // dangling. With the fix, _terminatingActorsByName tracks the in-flight
    // terminator by name; the later deploy overwrites the buffered pending command
    // (last-write-wins) and the displaced sender is told it was superseded.
    const string instanceName = "RapidPump";
    var shortTimeout = TimeSpan.FromSeconds(5);
    var longTimeout = TimeSpan.FromSeconds(10);

    var deploymentManager = CreateDeploymentManager();
    await Task.Delay(500);

    // Initial deploy: establishes the running instance.
    deploymentManager.Tell(new DeployInstanceCommand(
        "dep-1", instanceName, "h1", MakeConfigJson(instanceName), "admin", DateTimeOffset.UtcNow));
    var initialReply = ExpectMsg<DeploymentStatusResponse>(shortTimeout);
    Assert.Equal(DeploymentStatus.Success, initialReply.Status);
    await Task.Delay(200);

    // Two rapid redeploys, sent before the predecessor can finish terminating.
    // dep-2 stops the actor (watching it) and buffers itself; dep-3 arrives almost
    // immediately and must NOT crash: it overwrites the buffered pending command
    // and dep-2's sender is told it was superseded.
    var supersededSender = CreateTestProbe();
    var winningSender = CreateTestProbe();

    deploymentManager.Tell(
        new DeployInstanceCommand(
            "dep-2", instanceName, "h2", MakeConfigJson(instanceName), "admin", DateTimeOffset.UtcNow),
        supersededSender.Ref);
    deploymentManager.Tell(
        new DeployInstanceCommand(
            "dep-3", instanceName, "h3", MakeConfigJson(instanceName), "admin", DateTimeOffset.UtcNow),
        winningSender.Ref);

    // dep-2 is reported as failed because it was superseded...
    var supersededReply = supersededSender.ExpectMsg<DeploymentStatusResponse>(longTimeout);
    Assert.Equal("dep-2", supersededReply.DeploymentId);
    Assert.Equal(DeploymentStatus.Failed, supersededReply.Status);
    Assert.NotNull(supersededReply.ErrorMessage);
    Assert.Contains("superseded", supersededReply.ErrorMessage!, StringComparison.OrdinalIgnoreCase);

    // ...and dep-3 succeeds once the predecessor has finished terminating.
    var winningReply = winningSender.ExpectMsg<DeploymentStatusResponse>(longTimeout);
    Assert.Equal("dep-3", winningReply.DeploymentId);
    Assert.Equal(DeploymentStatus.Success, winningReply.Status);

    // A successful disable proves there is no orphaned actor and no half-created
    // child still holding the name.
    deploymentManager.Tell(new DisableInstanceCommand("cmd-1", instanceName, DateTimeOffset.UtcNow));
    var disableReply = ExpectMsg<InstanceLifecycleResponse>(shortTimeout);
    Assert.True(disableReply.Success);
}
|
|
|
|
/// <summary>
/// Regression test for SiteRuntime-029: a delete that arrives while a redeploy is
/// still waiting for the predecessor actor to terminate. The delete must win: the
/// pending redeploy is reported as superseded, no deployed-config row survives, and
/// the reported deployed count goes from 1 back to 0.
/// </summary>
[Fact]
public async Task SR029_DeleteDuringPendingRedeploy_InstanceStaysDeleted_AndCounterIsCorrect()
{
    // Regression test for SiteRuntime-029. A delete arriving WHILE a redeploy is
    // still terminating used to: (1) over-decrement _totalDeployedCount, and
    // (2) leave the buffered _pendingRedeploys entry intact — so when Terminated
    // fired, HandleTerminated called ApplyDeployment(isRedeploy: true) and
    // RESURRECTED the just-deleted instance (re-creating the actor and re-writing
    // the deployed-config SQLite row). After the fix, HandleDelete is authoritative
    // over the mid-redeploy bookkeeping: it cancels the pending redeploy (telling
    // the displaced deployer it was superseded), clears the terminating shadow, and
    // decrements the counter exactly once.
    var health = new CountCapturingHealthCollector();
    var actor = CreateDeploymentManager(health);
    await Task.Delay(500);

    // Establish the running instance.
    actor.Tell(new DeployInstanceCommand(
        "dep-1", "RaceTarget", "h1", MakeConfigJson("RaceTarget"), "admin", DateTimeOffset.UtcNow));
    var first = ExpectMsg<DeploymentStatusResponse>(TimeSpan.FromSeconds(5));
    Assert.Equal(DeploymentStatus.Success, first.Status);
    await Task.Delay(300);

    // Pin the baseline. The fake collector starts at 0, so without this check the
    // final "count is 0" assertion would also pass if the collector were never
    // updated at all. The SR032 test in this class makes the same check after the
    // same delay.
    Assert.Equal(1, health.LastDeployedCount);

    // Fire a redeploy immediately followed by a delete. Both queue on the
    // singleton mailbox: HandleDeploy runs first (removes from _instanceActors,
    // watches + stops the predecessor, buffers the redeploy, sets the terminating
    // shadow), then HandleDelete runs while the predecessor is still terminating
    // (Terminated has not fired) — exactly the SiteRuntime-029 window.
    var redeployProbe = CreateTestProbe();
    actor.Tell(new DeployInstanceCommand(
        "dep-2", "RaceTarget", "h2", MakeConfigJson("RaceTarget"), "admin", DateTimeOffset.UtcNow),
        redeployProbe.Ref);
    actor.Tell(new DeleteInstanceCommand("del-1", "RaceTarget", DateTimeOffset.UtcNow));

    // The delete succeeds...
    var delete = ExpectMsg<InstanceLifecycleResponse>(TimeSpan.FromSeconds(10));
    Assert.True(delete.Success);

    // ...and the displaced redeploy is told it was superseded (not silently lost).
    var superseded = redeployProbe.ExpectMsg<DeploymentStatusResponse>(TimeSpan.FromSeconds(10));
    Assert.Equal("dep-2", superseded.DeploymentId);
    Assert.Equal(DeploymentStatus.Failed, superseded.Status);
    // Check for a missing message explicitly (as the SR020 test does) so a null
    // ErrorMessage fails with a clear assertion instead of inside Assert.Contains.
    Assert.NotNull(superseded.ErrorMessage);
    Assert.Contains("superseded", superseded.ErrorMessage!, StringComparison.OrdinalIgnoreCase);

    // Give the predecessor's Terminated signal time to fire — it must NOT
    // resurrect the deleted instance.
    await Task.Delay(1000);

    // The instance stays deleted: no deployed-config row remains.
    var configs = await _storage.GetAllDeployedConfigsAsync();
    Assert.DoesNotContain(configs, c => c.InstanceUniqueName == "RaceTarget");

    // The deployed count is back to 0 — neither over-decremented nor resurrected.
    Assert.Equal(0, health.LastDeployedCount);
}
|
|
|
|
/// <summary>
/// Regression test for SiteRuntime-032: deploy, disable, then delete an instance and
/// verify the reported deployed count goes 1 → 1 → 0 and that no deployed-config row
/// is left behind.
/// </summary>
[Fact]
public async Task SR032_DeleteDisabledInstance_DecrementsDeployedCount()
{
    // Background (SiteRuntime-032): the SiteRuntime-029 fix only decremented the
    // deployed count when the instance was present in _instanceActors OR was
    // mid-redeploy in _terminatingActorsByName. A DISABLED instance is in NEITHER
    // map (disable removes it from _instanceActors and never adds it to the
    // terminating shadow), yet it still has a deployed-config row that counts as
    // deployed. Deleting a disabled instance therefore skipped the decrement and
    // leaked the deployed/disabled tally on the health dashboard. After the fix the
    // count is derived from the authoritative set of deployed config names, so a
    // delete decrements for a disabled instance too.
    const string instanceName = "DisablePump";
    var replyTimeout = TimeSpan.FromSeconds(5);

    var collector = new CountCapturingHealthCollector();
    var deploymentManager = CreateDeploymentManager(collector);
    await Task.Delay(500);

    // Step 1, deploy: the deployed count becomes 1.
    deploymentManager.Tell(new DeployInstanceCommand(
        "dep-1", instanceName, "h1", MakeConfigJson(instanceName), "admin", DateTimeOffset.UtcNow));
    var deployReply = ExpectMsg<DeploymentStatusResponse>(replyTimeout);
    Assert.Equal(DeploymentStatus.Success, deployReply.Status);
    await Task.Delay(300);
    Assert.Equal(1, collector.LastDeployedCount);

    // Step 2, disable: the instance is still deployed (count stays 1), just not enabled.
    deploymentManager.Tell(new DisableInstanceCommand("cmd-1", instanceName, DateTimeOffset.UtcNow));
    var disableReply = ExpectMsg<InstanceLifecycleResponse>(replyTimeout);
    Assert.True(disableReply.Success);
    Assert.Equal(1, collector.LastDeployedCount);

    // Step 3, delete the DISABLED instance: the deployed count must return to 0.
    // (The SiteRuntime-029 regression left it stuck at 1.)
    deploymentManager.Tell(new DeleteInstanceCommand("del-1", instanceName, DateTimeOffset.UtcNow));
    var deleteReply = ExpectMsg<InstanceLifecycleResponse>(replyTimeout);
    Assert.True(deleteReply.Success);
    Assert.Equal(0, collector.LastDeployedCount);

    // Storage must no longer hold a deployed-config row for the instance.
    var remainingConfigs = await _storage.GetAllDeployedConfigsAsync();
    Assert.DoesNotContain(remainingConfigs, config => config.InstanceUniqueName == instanceName);
}
|
|
|
|
/// <summary>
/// Deploys an instance once and redeploys it three more times, then verifies that
/// storage holds exactly one deployed-config row for it and that the reported
/// deployed count is exactly 1.
/// </summary>
[Fact]
public async Task Redeploy_ExistingInstance_DoesNotOverCountDeployedInstances()
{
    var health = new CountCapturingHealthCollector();
    var actor = CreateDeploymentManager(health);
    await Task.Delay(500);

    // Deploy once. The reply is checked rather than discarded: if the deployments
    // failed, one row and a count of 1 could still be observed, and the test would
    // pass without ever exercising the redeploy path.
    actor.Tell(new DeployInstanceCommand(
        "dep-1", "CountPump", "h1", MakeConfigJson("CountPump"), "admin", DateTimeOffset.UtcNow));
    var initial = ExpectMsg<DeploymentStatusResponse>(TimeSpan.FromSeconds(5));
    Assert.Equal("dep-1", initial.DeploymentId);
    Assert.Equal(DeploymentStatus.Success, initial.Status);
    await Task.Delay(500);

    // Redeploy several times; every redeploy must itself succeed.
    for (var i = 2; i <= 4; i++)
    {
        actor.Tell(new DeployInstanceCommand(
            $"dep-{i}", "CountPump", $"h{i}", MakeConfigJson("CountPump"), "admin", DateTimeOffset.UtcNow));
        var redeploy = ExpectMsg<DeploymentStatusResponse>(TimeSpan.FromSeconds(10));
        Assert.Equal($"dep-{i}", redeploy.DeploymentId);
        Assert.Equal(DeploymentStatus.Success, redeploy.Status);
        await Task.Delay(500);
    }

    // Storage uses UPSERT — exactly one deployed config row should exist.
    var configs = await _storage.GetAllDeployedConfigsAsync();
    Assert.Single(configs, c => c.InstanceUniqueName == "CountPump");

    // The reported deployed count must be exactly 1 — a redeploy is an update,
    // not a new instance, so the in-memory counter must not drift upward.
    Assert.Equal(1, health.LastDeployedCount);
}
|
|
}
|