From c9643f68ba1cb97776a8832ce883b4d569ecaa7d Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 05:41:18 -0400 Subject: [PATCH] fix(runtime): restart driver no longer throws 'actor name is not unique' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HandleRestartDriver stopped + respawned the child within one synchronous message handler, reusing the base actor name drv-. Context.Stop is async (the child processes its own stop on its own mailbox), so the old child was ALWAYS still registered when the respawn ran — Context.ActorOf threw InvalidActorNameException deterministically on every AdminUI Restart press, crashing + restarting the host. Fix: a monotonic _childSpawnGeneration counter (single-threaded actor) feeds a -g suffix on every spawned child name, so a respawn can never collide with the still-terminating predecessor. Children are tracked by the _children dict (by IActorRef), never by actor path, so the suffix is invisible to callers. This also closes the same-shaped latent race in the reconcile path (a removed- then-readded instance, and a driver-type-change ToStop+ToSpawn in one plan). Regression test RestartDriver_respawns_the_child_without_an_actor_name_collision (verified: FAILS on the old code with the exact InvalidActorNameException, PASSES with the fix). Runtime.Tests 238/238 green. Code-reviewed (approved). --- .../Drivers/DriverHostActor.cs | 24 ++++++++-- .../Drivers/DriverHostActorReconcileTests.cs | 46 +++++++++++++++++++ 2 files changed, 65 insertions(+), 5 deletions(-) diff --git a/src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DriverHostActor.cs b/src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DriverHostActor.cs index b48adf81..f272c3cb 100644 --- a/src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DriverHostActor.cs +++ b/src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Drivers/DriverHostActor.cs @@ -88,6 +88,11 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers private readonly Dictionary _children = new(StringComparer.Ordinal); + // Monotonic counter feeding the child actor-name suffix (see ActorNameFor / SpawnChild). Single- + // threaded actor, so a plain increment is safe; it only ever grows, guaranteeing a unique name per + // spawn so a restart's respawn never collides with the still-terminating old child. + private long _childSpawnGeneration; + /// /// Driver live-value routing map: (DriverInstanceId, FullName) → folder-scoped equipment /// NodeId(s). Rebuilt every apply by from the @@ -984,6 +989,13 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers // identity for pre-PR artifacts that don't carry it yet (older deploys persisted before // ClusterId was added to DriverInstanceSpec). var clusterId = !string.IsNullOrEmpty(spec.ClusterId) ? spec.ClusterId : _localNode.Value; + // A fresh generation-suffixed name on EVERY spawn so a respawn can never collide with a child + // still tearing down: Akka frees a stopped actor's name only after it FULLY terminates (async), + // but HandleRestartDriver stops + respawns within one (synchronous) message handler — the old + // child is still registered, so reusing the base name throws InvalidActorNameException + // ("actor name is not unique"). Children are tracked by the _children dict (by IActorRef), never + // by path, so the suffix is invisible to every caller. + var actorName = ActorNameFor(spec.DriverInstanceId, _childSpawnGeneration++); if (stub) { child = Context.ActorOf( @@ -993,7 +1005,7 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers startStubbed: true, healthPublisher: _healthPublisher, clusterId: clusterId), - ActorNameFor(spec.DriverInstanceId)); + actorName); } else { @@ -1002,7 +1014,7 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers driver!, healthPublisher: _healthPublisher, clusterId: clusterId), - ActorNameFor(spec.DriverInstanceId)); + actorName); child.Tell(new DriverInstanceActor.InitializeRequested(spec.DriverConfig)); } @@ -1028,11 +1040,13 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers _log.Info("DriverHost {Node}: stopped driver child {Id}", _localNode, driverInstanceId); } - private static string ActorNameFor(string driverInstanceId) + private static string ActorNameFor(string driverInstanceId, long generation) { - // Akka actor names cannot contain '/', ':', or whitespace. Mangle defensively. + // Akka actor names cannot contain '/', ':', or whitespace. Mangle defensively. The monotonic + // generation suffix guarantees a never-before-used name on every spawn (see SpawnChild) so a + // restart's respawn can never collide with the still-terminating predecessor. var chars = driverInstanceId.Select(c => char.IsLetterOrDigit(c) || c is '-' or '_' or '.' ? c : '_').ToArray(); - return "drv-" + new string(chars); + return "drv-" + new string(chars) + "-g" + generation; } /// diff --git a/tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/DriverHostActorReconcileTests.cs b/tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/DriverHostActorReconcileTests.cs index f6c8a709..56a85fb8 100644 --- a/tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/DriverHostActorReconcileTests.cs +++ b/tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Drivers/DriverHostActorReconcileTests.cs @@ -3,6 +3,7 @@ using Akka.Actor; using Microsoft.EntityFrameworkCore; using Shouldly; using Xunit; +using ZB.MOM.WW.OtOpcUa.Commons.Messages.Admin; using ZB.MOM.WW.OtOpcUa.Commons.Messages.Deploy; using ZB.MOM.WW.OtOpcUa.Commons.Messages.Fleet; using ZB.MOM.WW.OtOpcUa.Commons.Types; @@ -171,6 +172,51 @@ public sealed class DriverHostActorReconcileTests : RuntimeActorTestBase snap.Drivers[0].Name.ShouldBe("sa-modbus"); } + /// + /// Regression for the Restart-driver "actor name is not unique" bug: HandleRestartDriver + /// stops the old child and respawns from the same spec WITHIN ONE synchronous message handler. + /// Akka frees a stopped actor's name only AFTER it fully terminates (async, on the child's own + /// mailbox), so the old child is still registered when the respawn runs — reusing the base actor + /// name threw deterministically on every restart, crashing + /// + restarting the host. The fix gives every spawn a fresh generation-suffixed name. Here: + /// deploy a Modbus driver, restart it, and assert no such exception is logged and the child is + /// cleanly respawned (factory re-created exactly once; one child remains). + /// + [Fact] + public void RestartDriver_respawns_the_child_without_an_actor_name_collision() + { + var db = NewInMemoryDbFactory(); + var factory = new CountingDriverFactory("Modbus"); + var deploymentId = SeedDeploymentWithDrivers(db, RevA, ("DI-1", "Modbus", "{}", true)); + + var coordinator = CreateTestProbe(); + var actor = Sys.ActorOf(DriverHostActor.Props( + db, TestNode, coordinator.Ref, + driverFactory: factory, + localRoles: new HashSet { "driver" })); + + actor.Tell(new DispatchDeployment(deploymentId, RevA, CorrelationId.NewId())); + coordinator.ExpectMsg(TimeSpan.FromSeconds(5)).Outcome.ShouldBe(ApplyAckOutcome.Applied); + AwaitAssert(() => factory.CreateCount.ShouldBe(1), duration: TimeSpan.FromSeconds(3)); + + // The restart must NOT throw InvalidActorNameException (which the buggy in-handler respawn did, + // crashing + restarting the host). Expect(0, …) asserts zero such logged events across a + // synchronous round-trip that guarantees the restart is fully processed. + EventFilter.Exception().Expect(0, () => + { + actor.Tell(new RestartDriver("MAIN", "DI-1", "tester", Guid.NewGuid())); + actor.Tell(new GetDiagnostics(CorrelationId.NewId()), coordinator.Ref); + coordinator.ExpectMsg(TimeSpan.FromSeconds(3)); + }); + + // The respawn re-created the driver under a fresh name — exactly once more (1 → 2), proving the + // restart completed in-handler rather than throwing — and exactly one child remains. + AwaitAssert(() => factory.CreateCount.ShouldBe(2), duration: TimeSpan.FromSeconds(3)); + actor.Tell(new GetDiagnostics(CorrelationId.NewId()), coordinator.Ref); + coordinator.ExpectMsg(TimeSpan.FromSeconds(2)) + .Drivers.Count.ShouldBe(1); + } + private static DeploymentId SeedMultiClusterDeployment( IDbContextFactory db, RevisionHash rev,