test(redundancy): lock in stale-Terminated guard + clarify OnTerminated (code-review)
This commit is contained in:
@@ -108,12 +108,15 @@ public sealed class PeerProbeSupervisor : ReceiveActor
|
||||
/// <param name="t">The termination notice for the dead child.</param>
|
||||
private void OnTerminated(Terminated t)
|
||||
{
|
||||
var dead = _children.FirstOrDefault(kvp => kvp.Value.Equals(t.ActorRef));
|
||||
if (!dead.Equals(default(KeyValuePair<NodeId, IActorRef>)) && dead.Value is not null)
|
||||
{
|
||||
_children.Remove(dead.Key);
|
||||
_log.Debug("PeerProbeSupervisor: pruned terminated child for peer {Peer}", dead.Key);
|
||||
}
|
||||
var entry = _children.FirstOrDefault(kvp => kvp.Value.Equals(t.ActorRef));
|
||||
// No-match yields Value == null because IActorRef is a reference type. A null here means the
|
||||
// terminated ref isn't our CURRENT child for any peer — either it was already pruned in
|
||||
// OnSnapshot (deliberate stop pre-removes from _children), or it's a stale old-generation
|
||||
// child whose peer has since been re-spawned with a fresh child. Either way: no-op, so we
|
||||
// never evict the fresh same-peer child on a stale Terminated.
|
||||
if (entry.Value is null) return;
|
||||
_children.Remove(entry.Key);
|
||||
_log.Debug("PeerProbeSupervisor: pruned terminated child for peer {Peer}", entry.Key);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -24,6 +24,19 @@ public sealed class PeerProbeSupervisorTests : RuntimeActorTestBase
|
||||
|
||||
private static Props NoopProps() => Akka.Actor.Props.Create(() => new NoopActor());
|
||||
|
||||
/// <summary>
|
||||
/// No-op child stub that records its own <see cref="ActorBase.Self"/> into a shared list on
|
||||
/// start, in spawn order — lets a test grab a specific (e.g. the FIRST, old-generation) child
|
||||
/// ref so it can deliver a synthetic <see cref="Terminated"/> for it.
|
||||
/// </summary>
|
||||
private sealed class RecordingNoopActor : ReceiveActor
|
||||
{
|
||||
public RecordingNoopActor(List<IActorRef> spawned) => spawned.Add(Self);
|
||||
}
|
||||
|
||||
private static Props RecordingProps(List<IActorRef> spawned) =>
|
||||
Akka.Actor.Props.Create(() => new RecordingNoopActor(spawned));
|
||||
|
||||
private static NodeRedundancyState State(NodeId id, RedundancyRole role) =>
|
||||
new(id, role, IsClusterLeader: false, IsRoleLeaderForDriver: false, DateTime.UtcNow);
|
||||
|
||||
@@ -102,4 +115,44 @@ public sealed class PeerProbeSupervisorTests : RuntimeActorTestBase
|
||||
AwaitAssert(() => sup.UnderlyingActor.ChildCount.ShouldBe(1),
|
||||
duration: TimeSpan.FromMilliseconds(500));
|
||||
}
|
||||
|
||||
/// <summary>Locks in the stale-Terminated guard: when an OLD (already-replaced) child's
|
||||
/// <see cref="Terminated"/> for a peer arrives AFTER a NEW child for the SAME peer was spawned,
|
||||
/// the fresh child must NOT be evicted (removal is keyed by current-child ref-equality, not by
|
||||
/// peer key). Without this guard a late stale Terminated would silently drop a live probe.</summary>
|
||||
[Fact]
|
||||
public void Stale_terminated_for_old_child_does_not_evict_fresh_peer_child()
|
||||
{
|
||||
var spawned = new List<IActorRef>();
|
||||
var sup = ActorOfAsTestActorRef<PeerProbeSupervisor>(
|
||||
PeerProbeSupervisor.PropsForTests(Local, _ => RecordingProps(spawned)));
|
||||
|
||||
// First add of the peer -> child #0 (the OLD ref).
|
||||
sup.Tell(Snapshot(
|
||||
State(Local, RedundancyRole.Primary),
|
||||
State(Peer, RedundancyRole.Secondary)));
|
||||
AwaitAssert(() => sup.UnderlyingActor.ChildCount.ShouldBe(1),
|
||||
duration: TimeSpan.FromMilliseconds(500));
|
||||
AwaitAssert(() => spawned.Count.ShouldBe(1), duration: TimeSpan.FromMilliseconds(500));
|
||||
var oldRef = spawned[0];
|
||||
|
||||
// Drop the peer -> child #0 stopped, ChildCount back to 0.
|
||||
sup.Tell(Snapshot(State(Local, RedundancyRole.Primary)));
|
||||
AwaitAssert(() => sup.UnderlyingActor.ChildCount.ShouldBe(0),
|
||||
duration: TimeSpan.FromMilliseconds(500));
|
||||
|
||||
// Re-add the SAME peer -> a NEW child #1 (the FRESH ref) is spawned.
|
||||
sup.Tell(Snapshot(
|
||||
State(Local, RedundancyRole.Primary),
|
||||
State(Peer, RedundancyRole.Secondary)));
|
||||
AwaitAssert(() => sup.UnderlyingActor.ChildCount.ShouldBe(1),
|
||||
duration: TimeSpan.FromMilliseconds(500));
|
||||
AwaitAssert(() => spawned.Count.ShouldBe(2), duration: TimeSpan.FromMilliseconds(500));
|
||||
|
||||
// Now deliver a STALE Terminated for the OLD ref. The current child for Peer is the fresh
|
||||
// child #1, so ref-equality finds no match and the supervisor must leave ChildCount at 1.
|
||||
sup.Tell(new Terminated(oldRef, existenceConfirmed: true, addressTerminated: false));
|
||||
AwaitAssert(() => sup.UnderlyingActor.ChildCount.ShouldBe(1),
|
||||
duration: TimeSpan.FromMilliseconds(500));
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user