test(host): deploy happy-path + idempotency integration tests (Task 59)

DeployHappyPathTests exercises the full deploy pipeline on the 2-node harness:
AdminOperationsActor → ConfigPublishCoordinator → DistributedPubSub →
DriverHostActor on both nodes → ApplyAck → coordinator seals. Verifies both
NodeDeploymentState rows reach Applied and Deployment.Status reaches Sealed.

Exposed + fixed two production bugs along the way:

1. Coordinator was publishing DispatchDeployment on the "deployments" topic but
   never subscribed to anything, so the ACKs that DriverHostActor published on
   that same topic could not reach it. Added a dedicated "deployment-acks"
   topic: the coordinator subscribes to it in PreStart, and DriverHostActor
   publishes its ACKs there.

2. NodeId derivation used member.Address.Host only — two cluster members on a
   shared loopback host (test harness, dev VMs) collapsed into one identity. The
   coordinator's expected-ack set shrank to a single entry, so the deployment
   sealed after only half the nodes had acked. Switched to host:port everywhere
   (ClusterRoleInfo + coordinator) so loopback nodes stay distinct; in
   production the port suffix is harmless extra specificity.

Tests: 95 v2 tests pass (93 existing + 2 new deploy tests), 0 skipped.

Failover scenarios (design §8 cases 3-7: node-kill-mid-apply, split-brain,
restart-during-deploy) deferred — they need controlled node-down primitives
on the harness. Tracked as F22 (failover scenario test cases).
This commit is contained in:
Joseph Doherty
2026-05-26 06:34:36 -04:00
parent 62e3cd6599
commit 5cfbe8b5dd
5 changed files with 158 additions and 18 deletions

View File

@@ -29,7 +29,10 @@ public sealed class ClusterRoleInfo : IClusterRoleInfo, IDisposable
{
_cluster = Akka.Cluster.Cluster.Get(system);
_logger = logger;
_localNode = CommonsNodeId.Parse(options.Value.PublicHostname);
// NodeId encodes host:port so cluster members on shared hosts (test loopback, dev VMs
// sharing a bind IP) stay distinct. Production hosts have unique DNS names so the port
// suffix is harmless redundancy.
_localNode = CommonsNodeId.Parse($"{options.Value.PublicHostname}:{options.Value.Port}");
_localRoles = new HashSet<string>(options.Value.Roles, StringComparer.Ordinal);
SeedFromCurrentState();
@@ -48,7 +51,7 @@ public sealed class ClusterRoleInfo : IClusterRoleInfo, IDisposable
{
if (!_membersByRole.TryGetValue(role, out var members)) return Array.Empty<CommonsNodeId>();
return members
.Select(m => CommonsNodeId.Parse(m.Address.Host ?? string.Empty))
.Select(m => ToNodeId(m.Address))
.ToArray();
}
}
@@ -58,7 +61,7 @@ public sealed class ClusterRoleInfo : IClusterRoleInfo, IDisposable
lock (_lock)
{
return _roleLeaders.TryGetValue(role, out var leader) && leader is not null
? CommonsNodeId.Parse(leader.Address.Host ?? string.Empty)
? ToNodeId(leader.Address)
: (CommonsNodeId?)null;
}
}
@@ -121,7 +124,7 @@ public sealed class ClusterRoleInfo : IClusterRoleInfo, IDisposable
{
_roleLeaders.TryGetValue(evt.Role, out var prevMember);
if (prevMember is not null)
previous = CommonsNodeId.Parse(prevMember.Address.Host ?? string.Empty);
previous = ToNodeId(prevMember.Address);
var nextMember = evt.Leader is null
? null
@@ -129,7 +132,7 @@ public sealed class ClusterRoleInfo : IClusterRoleInfo, IDisposable
_roleLeaders[evt.Role] = nextMember;
if (nextMember is not null)
next = CommonsNodeId.Parse(nextMember.Address.Host ?? string.Empty);
next = ToNodeId(nextMember.Address);
raise = !Nullable.Equals(previous, next);
}
@@ -150,6 +153,9 @@ public sealed class ClusterRoleInfo : IClusterRoleInfo, IDisposable
}
}
/// <summary>
/// Builds a node identity from a cluster member address by formatting it as
/// <c>host:port</c> and parsing the result with <c>CommonsNodeId.Parse</c>.
/// A null host is rendered as an empty string and a null port as <c>0</c>.
/// </summary>
/// <param name="address">The Akka address of the cluster member.</param>
/// <returns>The node id parsed from the <c>host:port</c> string.</returns>
private static CommonsNodeId ToNodeId(Akka.Actor.Address address) =>
CommonsNodeId.Parse($"{address.Host ?? string.Empty}:{address.Port ?? 0}");
public void Dispose()
{
_subscriber?.Tell(PoisonPill.Instance);

View File

@@ -24,6 +24,7 @@ namespace ZB.MOM.WW.OtOpcUa.ControlPlane.Coordinators;
public sealed class ConfigPublishCoordinator : ReceiveActor, IWithTimers
{
public const string DeploymentsTopic = "deployments";
public const string DeploymentAcksTopic = "deployment-acks";
public static readonly TimeSpan DefaultApplyDeadline = TimeSpan.FromMinutes(2);
private readonly IDbContextFactory<OtOpcUaConfigDbContext> _dbFactory;
@@ -50,6 +51,7 @@ public sealed class ConfigPublishCoordinator : ReceiveActor, IWithTimers
Receive<DispatchDeployment>(HandleDispatch);
Receive<ApplyAck>(HandleAck);
Receive<DeadlineElapsed>(HandleDeadline);
Receive<SubscribeAck>(_ => { /* DPS subscribe confirmation */ });
}
/// <summary>
@@ -59,6 +61,10 @@ public sealed class ConfigPublishCoordinator : ReceiveActor, IWithTimers
/// </summary>
protected override void PreStart()
{
// Subscribe to per-node ApplyAck broadcasts so DriverHostActors on remote members can
// route their ACKs to whichever node currently hosts this singleton.
DistributedPubSub.Get(Context.System).Mediator.Tell(new Subscribe(DeploymentAcksTopic, Self));
using var db = _dbFactory.CreateDbContext();
var inflight = db.Deployments
.Where(d => d.Status == DeploymentStatus.Dispatching || d.Status == DeploymentStatus.AwaitingApplyAcks)
@@ -239,7 +245,9 @@ public sealed class ConfigPublishCoordinator : ReceiveActor, IWithTimers
if (!member.Roles.Contains("driver")) continue;
var host = member.Address.Host;
if (string.IsNullOrWhiteSpace(host)) continue;
nodes.Add(NodeId.Parse(host));
// Match ClusterRoleInfo's NodeId derivation (host:port) so DriverHostActor's
// self-identification and the coordinator's expected-ack set agree.
nodes.Add(NodeId.Parse($"{host}:{member.Address.Port ?? 0}"));
}
return nodes;
}

View File

@@ -30,6 +30,7 @@ namespace ZB.MOM.WW.OtOpcUa.Runtime.Drivers;
public sealed class DriverHostActor : ReceiveActor, IWithTimers
{
public const string DeploymentsTopic = "deployments";
public const string DeploymentAcksTopic = "deployment-acks";
public static readonly TimeSpan ReconnectInterval = TimeSpan.FromSeconds(30);
private readonly IDbContextFactory<OtOpcUaConfigDbContext> _dbFactory;
@@ -276,9 +277,10 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
}
else
{
// No direct coordinator handle — publish back through DistributedPubSub so the
// singleton routes it. The coordinator subscribes to its own incoming topic.
DistributedPubSub.Get(Context.System).Mediator.Tell(new Publish(DeploymentsTopic, ack));
// No direct coordinator handle — publish on the dedicated ACK topic. The coordinator
// singleton subscribes there in PreStart so the ACK reaches whichever admin node hosts
// it without an actor-path lookup.
DistributedPubSub.Get(Context.System).Mediator.Tell(new Publish(DeploymentAcksTopic, ack));
}
}
}