feat(redundancy): PeerProbeSupervisor maintains one peer OPC UA probe per driver peer
This commit is contained in:
@@ -0,0 +1,127 @@
|
||||
using Akka.Actor;
|
||||
using Akka.Cluster.Tools.PublishSubscribe;
|
||||
using Akka.Event;
|
||||
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Redundancy;
|
||||
using ZB.MOM.WW.OtOpcUa.Commons.Types;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Runtime.Health;
|
||||
|
||||
/// <summary>
|
||||
/// Per-node supervisor that keeps exactly one <see cref="PeerOpcUaProbeActor"/> child per OTHER,
|
||||
/// non-<see cref="RedundancyRole.Detached"/> driver node named in the latest
|
||||
/// <see cref="RedundancyStateChanged"/> snapshot. Because every node runs this supervisor, each
|
||||
/// node is continuously TCP-probed by all of its peers; a node learns its own reachability by
|
||||
/// consuming the resulting <see cref="PeerOpcUaProbeActor.OpcUaProbeResult"/> messages in
|
||||
/// <c>OpcUaPublishActor</c>.
|
||||
///
|
||||
/// The supervisor reacts to the published <see cref="RedundancyStateChanged"/> snapshot (a plain
|
||||
/// record on the <see cref="PeerOpcUaProbeActor.RedundancyStateTopic"/> DistributedPubSub topic)
|
||||
/// rather than to raw Akka <c>Member</c> events, so the desired peer set is derived from one
|
||||
/// authoritative message instead of being reconstructed from membership churn.
|
||||
/// </summary>
|
||||
public sealed class PeerProbeSupervisor : ReceiveActor
|
||||
{
|
||||
private readonly NodeId _localNode;
|
||||
private readonly Func<NodeId, Props> _probeFactory;
|
||||
private readonly Dictionary<NodeId, IActorRef> _children = new();
|
||||
private readonly ILoggingAdapter _log = Context.GetLogger();
|
||||
|
||||
/// <summary>
|
||||
/// Monotonic generation counter appended to each spawned child's name. Guarantees a freshly
|
||||
/// re-added peer never collides on a child name with a same-peer child that is still
|
||||
/// asynchronously terminating after a <see cref="ActorBase.Context"/> <c>Stop</c>.
|
||||
/// </summary>
|
||||
private long _generation;
|
||||
|
||||
/// <summary>Creates production props that probe peers on the given OPC UA port.</summary>
|
||||
/// <param name="localNode">This node's identifier; excluded from the probed peer set.</param>
|
||||
/// <param name="opcUaPort">The OPC UA port each peer probe connects to.</param>
|
||||
/// <returns>Props configured to create a <see cref="PeerProbeSupervisor"/>.</returns>
|
||||
public static Props Props(NodeId localNode, int opcUaPort = PeerOpcUaProbeActor.DefaultOpcUaPort)
|
||||
{
|
||||
// Build the factory delegate OUTSIDE the Props.Create expression tree: a named argument
|
||||
// (opcUaPort:) is not permitted inside an expression tree, so capture it in a local first.
|
||||
Func<NodeId, Props> probeFactory = peer => PeerOpcUaProbeActor.Props(peer, opcUaPort: opcUaPort);
|
||||
return Akka.Actor.Props.Create(() => new PeerProbeSupervisor(localNode, probeFactory));
|
||||
}
|
||||
|
||||
/// <summary>Creates props with an injected probe factory for tests (e.g. no-op children).</summary>
|
||||
/// <param name="localNode">This node's identifier; excluded from the probed peer set.</param>
|
||||
/// <param name="probeFactory">Factory producing the child <see cref="Props"/> for a given peer.</param>
|
||||
/// <returns>Props configured to create a <see cref="PeerProbeSupervisor"/>.</returns>
|
||||
public static Props PropsForTests(NodeId localNode, Func<NodeId, Props> probeFactory) =>
|
||||
Akka.Actor.Props.Create(() => new PeerProbeSupervisor(localNode, probeFactory));
|
||||
|
||||
/// <summary>Initializes a new instance of the <see cref="PeerProbeSupervisor"/> class.</summary>
|
||||
/// <param name="localNode">This node's identifier; excluded from the probed peer set.</param>
|
||||
/// <param name="probeFactory">Factory producing the child <see cref="Props"/> for a given peer.</param>
|
||||
public PeerProbeSupervisor(NodeId localNode, Func<NodeId, Props> probeFactory)
|
||||
{
|
||||
_localNode = localNode;
|
||||
_probeFactory = probeFactory;
|
||||
|
||||
Receive<RedundancyStateChanged>(OnSnapshot);
|
||||
Receive<Terminated>(OnTerminated);
|
||||
Receive<SubscribeAck>(_ => { });
|
||||
}
|
||||
|
||||
/// <summary>Gets the current number of live peer-probe children.</summary>
|
||||
public int ChildCount => _children.Count;
|
||||
|
||||
/// <inheritdoc />
|
||||
protected override void PreStart() =>
|
||||
DistributedPubSub.Get(Context.System).Mediator.Tell(
|
||||
new Subscribe(PeerOpcUaProbeActor.RedundancyStateTopic, Self));
|
||||
|
||||
/// <summary>
|
||||
/// Reconciles the live child set against the desired peer set in the snapshot: stops children
|
||||
/// for peers that are gone (or became Detached / self), and spawns a probe for each newly
|
||||
/// desired peer.
|
||||
/// </summary>
|
||||
/// <param name="s">The latest redundancy-state snapshot.</param>
|
||||
private void OnSnapshot(RedundancyStateChanged s)
|
||||
{
|
||||
var want = s.Nodes
|
||||
.Where(n => n.NodeId != _localNode && n.Role != RedundancyRole.Detached)
|
||||
.Select(n => n.NodeId)
|
||||
.ToHashSet();
|
||||
|
||||
foreach (var gone in _children.Keys.Where(k => !want.Contains(k)).ToList())
|
||||
{
|
||||
Context.Stop(_children[gone]);
|
||||
_children.Remove(gone);
|
||||
}
|
||||
|
||||
foreach (var peer in want.Where(p => !_children.ContainsKey(p)))
|
||||
{
|
||||
var name = "probe-" + Sanitize(peer.Value) + "-" + _generation++;
|
||||
var child = Context.ActorOf(_probeFactory(peer), name);
|
||||
Context.Watch(child);
|
||||
_children[peer] = child;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Prunes a child that has terminated for any reason (e.g. crash) so <see cref="ChildCount"/>
|
||||
/// stays accurate and the next snapshot will respawn it if still desired.
|
||||
/// </summary>
|
||||
/// <param name="t">The termination notice for the dead child.</param>
|
||||
private void OnTerminated(Terminated t)
|
||||
{
|
||||
var dead = _children.FirstOrDefault(kvp => kvp.Value.Equals(t.ActorRef));
|
||||
if (!dead.Equals(default(KeyValuePair<NodeId, IActorRef>)) && dead.Value is not null)
|
||||
{
|
||||
_children.Remove(dead.Key);
|
||||
_log.Debug("PeerProbeSupervisor: pruned terminated child for peer {Peer}", dead.Key);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Sanitizes a <see cref="NodeId"/> value into a legal Akka child-name fragment — Akka actor
|
||||
/// names may not contain <c>:</c> or <c>.</c>, which appear in <c>host:port</c> node ids.
|
||||
/// </summary>
|
||||
/// <param name="s">The raw node-id value.</param>
|
||||
/// <returns>The value with every non-alphanumeric character replaced by <c>-</c>.</returns>
|
||||
private static string Sanitize(string s) =>
|
||||
new string(s.Select(c => char.IsLetterOrDigit(c) ? c : '-').ToArray());
|
||||
}
|
||||
Reference in New Issue
Block a user