fix(runtime): restart driver no longer throws 'actor name is not unique'
v2-ci / build (push) Failing after 42s
v2-ci / unit-tests (tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Security.Tests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.IntegrationTests) (push) Has been skipped
v2-ci / build (push) Failing after 42s
v2-ci / unit-tests (tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Security.Tests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.IntegrationTests) (push) Has been skipped
HandleRestartDriver stopped + respawned the child within one synchronous message handler, reusing the base actor name drv-<id>. Context.Stop is async (the child processes its own stop on its own mailbox), so the old child was ALWAYS still registered when the respawn ran — Context.ActorOf threw InvalidActorNameException deterministically on every AdminUI Restart press, crashing + restarting the host. Fix: a monotonic _childSpawnGeneration counter (single-threaded actor) feeds a -g<gen> suffix on every spawned child name, so a respawn can never collide with the still-terminating predecessor. Children are tracked by the _children dict (by IActorRef), never by actor path, so the suffix is invisible to callers. This also closes the same-shaped latent race in the reconcile path (a removed- then-readded instance, and a driver-type-change ToStop+ToSpawn in one plan). Regression test RestartDriver_respawns_the_child_without_an_actor_name_collision (verified: FAILS on the old code with the exact InvalidActorNameException, PASSES with the fix). Runtime.Tests 238/238 green. Code-reviewed (approved).
This commit is contained in:
@@ -88,6 +88,11 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
|
|||||||
|
|
||||||
private readonly Dictionary<string, ChildEntry> _children = new(StringComparer.Ordinal);
|
private readonly Dictionary<string, ChildEntry> _children = new(StringComparer.Ordinal);
|
||||||
|
|
||||||
|
// Monotonic counter feeding the child actor-name suffix (see ActorNameFor / SpawnChild). Single-
|
||||||
|
// threaded actor, so a plain increment is safe; it only ever grows, guaranteeing a unique name per
|
||||||
|
// spawn so a restart's respawn never collides with the still-terminating old child.
|
||||||
|
private long _childSpawnGeneration;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Driver live-value routing map: <c>(DriverInstanceId, FullName) → folder-scoped equipment
|
/// Driver live-value routing map: <c>(DriverInstanceId, FullName) → folder-scoped equipment
|
||||||
/// NodeId(s)</c>. Rebuilt every apply by <see cref="PushDesiredSubscriptions"/> from the
|
/// NodeId(s)</c>. Rebuilt every apply by <see cref="PushDesiredSubscriptions"/> from the
|
||||||
@@ -984,6 +989,13 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
|
|||||||
// identity for pre-PR artifacts that don't carry it yet (older deploys persisted before
|
// identity for pre-PR artifacts that don't carry it yet (older deploys persisted before
|
||||||
// ClusterId was added to DriverInstanceSpec).
|
// ClusterId was added to DriverInstanceSpec).
|
||||||
var clusterId = !string.IsNullOrEmpty(spec.ClusterId) ? spec.ClusterId : _localNode.Value;
|
var clusterId = !string.IsNullOrEmpty(spec.ClusterId) ? spec.ClusterId : _localNode.Value;
|
||||||
|
// A fresh generation-suffixed name on EVERY spawn so a respawn can never collide with a child
|
||||||
|
// still tearing down: Akka frees a stopped actor's name only after it FULLY terminates (async),
|
||||||
|
// but HandleRestartDriver stops + respawns within one (synchronous) message handler — the old
|
||||||
|
// child is still registered, so reusing the base name throws InvalidActorNameException
|
||||||
|
// ("actor name is not unique"). Children are tracked by the _children dict (by IActorRef), never
|
||||||
|
// by path, so the suffix is invisible to every caller.
|
||||||
|
var actorName = ActorNameFor(spec.DriverInstanceId, _childSpawnGeneration++);
|
||||||
if (stub)
|
if (stub)
|
||||||
{
|
{
|
||||||
child = Context.ActorOf(
|
child = Context.ActorOf(
|
||||||
@@ -993,7 +1005,7 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
|
|||||||
startStubbed: true,
|
startStubbed: true,
|
||||||
healthPublisher: _healthPublisher,
|
healthPublisher: _healthPublisher,
|
||||||
clusterId: clusterId),
|
clusterId: clusterId),
|
||||||
ActorNameFor(spec.DriverInstanceId));
|
actorName);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -1002,7 +1014,7 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
|
|||||||
driver!,
|
driver!,
|
||||||
healthPublisher: _healthPublisher,
|
healthPublisher: _healthPublisher,
|
||||||
clusterId: clusterId),
|
clusterId: clusterId),
|
||||||
ActorNameFor(spec.DriverInstanceId));
|
actorName);
|
||||||
child.Tell(new DriverInstanceActor.InitializeRequested(spec.DriverConfig));
|
child.Tell(new DriverInstanceActor.InitializeRequested(spec.DriverConfig));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1028,11 +1040,13 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
|
|||||||
_log.Info("DriverHost {Node}: stopped driver child {Id}", _localNode, driverInstanceId);
|
_log.Info("DriverHost {Node}: stopped driver child {Id}", _localNode, driverInstanceId);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static string ActorNameFor(string driverInstanceId)
|
private static string ActorNameFor(string driverInstanceId, long generation)
|
||||||
{
|
{
|
||||||
// Akka actor names cannot contain '/', ':', or whitespace. Mangle defensively.
|
// Akka actor names cannot contain '/', ':', or whitespace. Mangle defensively. The monotonic
|
||||||
|
// generation suffix guarantees a never-before-used name on every spawn (see SpawnChild) so a
|
||||||
|
// restart's respawn can never collide with the still-terminating predecessor.
|
||||||
var chars = driverInstanceId.Select(c => char.IsLetterOrDigit(c) || c is '-' or '_' or '.' ? c : '_').ToArray();
|
var chars = driverInstanceId.Select(c => char.IsLetterOrDigit(c) || c is '-' or '_' or '.' ? c : '_').ToArray();
|
||||||
return "drv-" + new string(chars);
|
return "drv-" + new string(chars) + "-g" + generation;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ using Akka.Actor;
|
|||||||
using Microsoft.EntityFrameworkCore;
|
using Microsoft.EntityFrameworkCore;
|
||||||
using Shouldly;
|
using Shouldly;
|
||||||
using Xunit;
|
using Xunit;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Admin;
|
||||||
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Deploy;
|
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Deploy;
|
||||||
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Fleet;
|
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Fleet;
|
||||||
using ZB.MOM.WW.OtOpcUa.Commons.Types;
|
using ZB.MOM.WW.OtOpcUa.Commons.Types;
|
||||||
@@ -171,6 +172,51 @@ public sealed class DriverHostActorReconcileTests : RuntimeActorTestBase
|
|||||||
snap.Drivers[0].Name.ShouldBe("sa-modbus");
|
snap.Drivers[0].Name.ShouldBe("sa-modbus");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Regression for the Restart-driver "actor name is not unique" bug: <c>HandleRestartDriver</c>
|
||||||
|
/// stops the old child and respawns from the same spec WITHIN ONE synchronous message handler.
|
||||||
|
/// Akka frees a stopped actor's name only AFTER it fully terminates (async, on the child's own
|
||||||
|
/// mailbox), so the old child is still registered when the respawn runs — reusing the base actor
|
||||||
|
/// name threw <see cref="InvalidActorNameException"/> deterministically on every restart, crashing
|
||||||
|
/// + restarting the host. The fix gives every spawn a fresh generation-suffixed name. Here:
|
||||||
|
/// deploy a Modbus driver, restart it, and assert no such exception is logged and the child is
|
||||||
|
/// cleanly respawned (factory re-created exactly once; one child remains).
|
||||||
|
/// </summary>
|
||||||
|
[Fact]
|
||||||
|
public void RestartDriver_respawns_the_child_without_an_actor_name_collision()
|
||||||
|
{
|
||||||
|
var db = NewInMemoryDbFactory();
|
||||||
|
var factory = new CountingDriverFactory("Modbus");
|
||||||
|
var deploymentId = SeedDeploymentWithDrivers(db, RevA, ("DI-1", "Modbus", "{}", true));
|
||||||
|
|
||||||
|
var coordinator = CreateTestProbe();
|
||||||
|
var actor = Sys.ActorOf(DriverHostActor.Props(
|
||||||
|
db, TestNode, coordinator.Ref,
|
||||||
|
driverFactory: factory,
|
||||||
|
localRoles: new HashSet<string> { "driver" }));
|
||||||
|
|
||||||
|
actor.Tell(new DispatchDeployment(deploymentId, RevA, CorrelationId.NewId()));
|
||||||
|
coordinator.ExpectMsg<ApplyAck>(TimeSpan.FromSeconds(5)).Outcome.ShouldBe(ApplyAckOutcome.Applied);
|
||||||
|
AwaitAssert(() => factory.CreateCount.ShouldBe(1), duration: TimeSpan.FromSeconds(3));
|
||||||
|
|
||||||
|
// The restart must NOT throw InvalidActorNameException (which the buggy in-handler respawn did,
|
||||||
|
// crashing + restarting the host). Expect(0, …) asserts zero such logged events across a
|
||||||
|
// synchronous round-trip that guarantees the restart is fully processed.
|
||||||
|
EventFilter.Exception<InvalidActorNameException>().Expect(0, () =>
|
||||||
|
{
|
||||||
|
actor.Tell(new RestartDriver("MAIN", "DI-1", "tester", Guid.NewGuid()));
|
||||||
|
actor.Tell(new GetDiagnostics(CorrelationId.NewId()), coordinator.Ref);
|
||||||
|
coordinator.ExpectMsg<Commons.Interfaces.NodeDiagnosticsSnapshot>(TimeSpan.FromSeconds(3));
|
||||||
|
});
|
||||||
|
|
||||||
|
// The respawn re-created the driver under a fresh name — exactly once more (1 → 2), proving the
|
||||||
|
// restart completed in-handler rather than throwing — and exactly one child remains.
|
||||||
|
AwaitAssert(() => factory.CreateCount.ShouldBe(2), duration: TimeSpan.FromSeconds(3));
|
||||||
|
actor.Tell(new GetDiagnostics(CorrelationId.NewId()), coordinator.Ref);
|
||||||
|
coordinator.ExpectMsg<Commons.Interfaces.NodeDiagnosticsSnapshot>(TimeSpan.FromSeconds(2))
|
||||||
|
.Drivers.Count.ShouldBe(1);
|
||||||
|
}
|
||||||
|
|
||||||
private static DeploymentId SeedMultiClusterDeployment(
|
private static DeploymentId SeedMultiClusterDeployment(
|
||||||
IDbContextFactory<OtOpcUaConfigDbContext> db,
|
IDbContextFactory<OtOpcUaConfigDbContext> db,
|
||||||
RevisionHash rev,
|
RevisionHash rev,
|
||||||
|
|||||||
Reference in New Issue
Block a user