diff --git a/src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Health/PeerProbeSupervisor.cs b/src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Health/PeerProbeSupervisor.cs index 72c9f9f9..76efaddd 100644 --- a/src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Health/PeerProbeSupervisor.cs +++ b/src/Server/ZB.MOM.WW.OtOpcUa.Runtime/Health/PeerProbeSupervisor.cs @@ -108,12 +108,15 @@ public sealed class PeerProbeSupervisor : ReceiveActor /// The termination notice for the dead child. private void OnTerminated(Terminated t) { - var dead = _children.FirstOrDefault(kvp => kvp.Value.Equals(t.ActorRef)); - if (!dead.Equals(default(KeyValuePair)) && dead.Value is not null) - { - _children.Remove(dead.Key); - _log.Debug("PeerProbeSupervisor: pruned terminated child for peer {Peer}", dead.Key); - } + var entry = _children.FirstOrDefault(kvp => kvp.Value.Equals(t.ActorRef)); + // No-match yields Value == null because IActorRef is a reference type. A null here means the + // terminated ref isn't our CURRENT child for any peer — either it was already pruned in + // OnSnapshot (deliberate stop pre-removes from _children), or it's a stale old-generation + // child whose peer has since been re-spawned with a fresh child. Either way: no-op, so we + // never evict the fresh same-peer child on a stale Terminated. + if (entry.Value is null) return; + _children.Remove(entry.Key); + _log.Debug("PeerProbeSupervisor: pruned terminated child for peer {Peer}", entry.Key); } /// diff --git a/tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Health/PeerProbeSupervisorTests.cs b/tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Health/PeerProbeSupervisorTests.cs index ebbd5e2d..f65136c4 100644 --- a/tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Health/PeerProbeSupervisorTests.cs +++ b/tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests/Health/PeerProbeSupervisorTests.cs @@ -24,6 +24,19 @@ public sealed class PeerProbeSupervisorTests : RuntimeActorTestBase private static Props NoopProps() => Akka.Actor.Props.Create(() => new NoopActor()); + /// + /// No-op child stub that records its own into a shared list on + /// start, in spawn order — lets a test grab a specific (e.g. the FIRST, old-generation) child + /// ref so it can deliver a synthetic for it. + /// + private sealed class RecordingNoopActor : ReceiveActor + { + public RecordingNoopActor(List spawned) => spawned.Add(Self); + } + + private static Props RecordingProps(List spawned) => + Akka.Actor.Props.Create(() => new RecordingNoopActor(spawned)); + private static NodeRedundancyState State(NodeId id, RedundancyRole role) => new(id, role, IsClusterLeader: false, IsRoleLeaderForDriver: false, DateTime.UtcNow); @@ -102,4 +115,44 @@ public sealed class PeerProbeSupervisorTests : RuntimeActorTestBase AwaitAssert(() => sup.UnderlyingActor.ChildCount.ShouldBe(1), duration: TimeSpan.FromMilliseconds(500)); } + + /// Locks in the stale-Terminated guard: when an OLD (already-replaced) child's + /// for a peer arrives AFTER a NEW child for the SAME peer was spawned, + /// the fresh child must NOT be evicted (removal is keyed by current-child ref-equality, not by + /// peer key). Without this guard a late stale Terminated would silently drop a live probe. + [Fact] + public void Stale_terminated_for_old_child_does_not_evict_fresh_peer_child() + { + var spawned = new List(); + var sup = ActorOfAsTestActorRef( + PeerProbeSupervisor.PropsForTests(Local, _ => RecordingProps(spawned))); + + // First add of the peer -> child #0 (the OLD ref). + sup.Tell(Snapshot( + State(Local, RedundancyRole.Primary), + State(Peer, RedundancyRole.Secondary))); + AwaitAssert(() => sup.UnderlyingActor.ChildCount.ShouldBe(1), + duration: TimeSpan.FromMilliseconds(500)); + AwaitAssert(() => spawned.Count.ShouldBe(1), duration: TimeSpan.FromMilliseconds(500)); + var oldRef = spawned[0]; + + // Drop the peer -> child #0 stopped, ChildCount back to 0. + sup.Tell(Snapshot(State(Local, RedundancyRole.Primary))); + AwaitAssert(() => sup.UnderlyingActor.ChildCount.ShouldBe(0), + duration: TimeSpan.FromMilliseconds(500)); + + // Re-add the SAME peer -> a NEW child #1 (the FRESH ref) is spawned. + sup.Tell(Snapshot( + State(Local, RedundancyRole.Primary), + State(Peer, RedundancyRole.Secondary))); + AwaitAssert(() => sup.UnderlyingActor.ChildCount.ShouldBe(1), + duration: TimeSpan.FromMilliseconds(500)); + AwaitAssert(() => spawned.Count.ShouldBe(2), duration: TimeSpan.FromMilliseconds(500)); + + // Now deliver a STALE Terminated for the OLD ref. The current child for Peer is the fresh + // child #1, so ref-equality finds no match and the supervisor must leave ChildCount at 1. + sup.Tell(new Terminated(oldRef, existenceConfirmed: true, addressTerminated: false)); + AwaitAssert(() => sup.UnderlyingActor.ChildCount.ShouldBe(1), + duration: TimeSpan.FromMilliseconds(500)); + } }