feat(runtime): PeerOpcUaProbeActor real TCP-connect probe (F12)

Replaces the Ok=true stub with a TCP connect to the peer's OPC UA port (default
4840) with a 2s timeout. A successful connect indicates the OPC UA server
process is up and accepting connections — enough for the redundancy calculator
to treat the peer as live. A full secure-channel Hello/Acknowledge handshake
is overkill for what the redundancy calculator consumes and would pull in the
OPC UA Client SDK plus a PKI setup. Upgrade later if a deeper liveness signal
is ever required.

Probe extracts the host from NodeId by stripping the :port suffix (commit
5cfbe8b encoded host:port into NodeId for cluster-member identity).

Tests: 2 new tests — Ok=true against a live TcpListener on a chosen port,
Ok=false against an unreachable endpoint. All 17 Runtime tests pass (was 16
covering only the message-contract surface).
This commit is contained in:
Joseph Doherty
2026-05-26 06:54:51 -04:00
parent f57f61deac
commit b06e3ae740
3 changed files with 96 additions and 16 deletions

View File

@@ -1,3 +1,4 @@
using System.Net.Sockets;
using Akka.Actor;
using Akka.Cluster.Tools.PublishSubscribe;
using Akka.Event;
@@ -8,7 +9,13 @@ namespace ZB.MOM.WW.OtOpcUa.Runtime.Health;
/// <summary>
/// Periodically pings a peer node's OPC UA endpoint (<c>opc.tcp://peer:4840</c>) and publishes
/// the result on the cluster's redundancy-state input topic so the admin <c>RedundancyStateActor</c>
/// can react. Real OPC UA probe call is staged for follow-up F12.
/// can react.
///
/// The probe is a plain TCP connect to the OPC UA port with a short timeout — enough to detect
/// "is the OPC UA server process up and accepting connections?" A full secure-channel handshake
/// (Hello / Acknowledge) needs the OPC UA Client SDK and a session/PKI setup, which is more than
/// what the redundancy calculator needs. Upgrade to a real Hello probe if a deeper liveness signal
/// is ever required.
/// </summary>
public sealed class PeerOpcUaProbeActor : ReceiveActor, IWithTimers
{
@@ -17,38 +24,87 @@ public sealed class PeerOpcUaProbeActor : ReceiveActor, IWithTimers
public const string RedundancyStateTopic = "redundancy-state";
public static readonly TimeSpan DefaultProbeInterval = TimeSpan.FromSeconds(10);
public static readonly TimeSpan DefaultConnectTimeout = TimeSpan.FromSeconds(2);
public const int DefaultOpcUaPort = 4840;
public sealed record OpcUaProbeResult(NodeId NodeId, bool Ok);
public sealed class Tick { public static readonly Tick Instance = new(); private Tick() { } }
private readonly NodeId _peer;
private readonly TimeSpan _interval;
private readonly TimeSpan _connectTimeout;
private readonly int _opcUaPort;
private readonly Action<object>? _broadcastOverride;
private readonly ILoggingAdapter _log = Context.GetLogger();
public ITimerScheduler Timers { get; set; } = null!;
public static Props Props(NodeId peer, TimeSpan? interval = null, Action<object>? broadcast = null) =>
Akka.Actor.Props.Create(() => new PeerOpcUaProbeActor(peer, interval ?? DefaultProbeInterval, broadcast));
public static Props Props(
NodeId peer,
TimeSpan? interval = null,
TimeSpan? connectTimeout = null,
int opcUaPort = DefaultOpcUaPort,
Action<object>? broadcast = null) =>
Akka.Actor.Props.Create(() => new PeerOpcUaProbeActor(
peer,
interval ?? DefaultProbeInterval,
connectTimeout ?? DefaultConnectTimeout,
opcUaPort,
broadcast));
public PeerOpcUaProbeActor(NodeId peer, TimeSpan interval, Action<object>? broadcastOverride)
public PeerOpcUaProbeActor(
NodeId peer,
TimeSpan interval,
TimeSpan connectTimeout,
int opcUaPort,
Action<object>? broadcastOverride)
{
_peer = peer;
_interval = interval;
_connectTimeout = connectTimeout;
_opcUaPort = opcUaPort;
_broadcastOverride = broadcastOverride;
Receive<Tick>(_ => RunProbe());
ReceiveAsync<Tick>(_ => RunProbeAsync());
}
protected override void PreStart() =>
Timers.StartPeriodicTimer("probe", Tick.Instance, _interval);
private void RunProbe()
private async Task RunProbeAsync()
{
// F12: actual opc.tcp ping. Assume Ok=true until the probe is wired.
var msg = new OpcUaProbeResult(_peer, Ok: true);
var host = ExtractHost(_peer);
var ok = await TryTcpConnectAsync(host, _opcUaPort, _connectTimeout);
var msg = new OpcUaProbeResult(_peer, ok);
if (_broadcastOverride is not null) _broadcastOverride(msg);
else DistributedPubSub.Get(Context.System).Mediator.Tell(new Publish(RedundancyStateTopic, msg));
_log.Debug("PeerOpcUaProbe: pinged {Peer} (probe staged for F12)", _peer);
_log.Debug("PeerOpcUaProbe: pinged {Peer} ({Host}:{Port}) → ok={Ok}",
_peer, host, _opcUaPort, ok);
}
/// <summary>
/// Returns the host portion of a <see cref="NodeId"/> whose value has the form
/// <c>host:port</c> (see <c>ClusterRoleInfo</c>), for use as the TCP connect target.
/// The value is cut at its last <c>':'</c>; when there is no colon, or the only colon
/// is the first character, the value is returned unchanged.
/// </summary>
private static string ExtractHost(NodeId nodeId)
{
    var raw = nodeId.Value;
    var separatorIndex = raw.LastIndexOf(':');

    // Index 0 would leave an empty host, so treat it the same as "no port suffix".
    if (separatorIndex <= 0)
    {
        return raw;
    }

    return raw.Substring(0, separatorIndex);
}
/// <summary>
/// Attempts a plain TCP connect to <paramref name="host"/>:<paramref name="port"/>,
/// giving up once <paramref name="timeout"/> has elapsed.
/// </summary>
/// <param name="host">Host name or address to connect to.</param>
/// <param name="port">TCP port to connect to.</param>
/// <param name="timeout">Delay after which the connect attempt is cancelled.</param>
/// <returns>
/// <c>true</c> when the connect completes and the client reports it is connected;
/// <c>false</c> when any exception is raised, including the cancellation triggered by the timeout.
/// </returns>
private static async Task<bool> TryTcpConnectAsync(string host, int port, TimeSpan timeout)
{
    try
    {
        using (var tcp = new TcpClient())
        using (var deadline = new CancellationTokenSource(timeout))
        {
            // The token fires after `timeout`, aborting a connect that is still pending.
            await tcp.ConnectAsync(host, port, deadline.Token);
            return tcp.Connected;
        }
    }
    catch (Exception)
    {
        // Best-effort probe: every failure mode is reported as "peer not reachable".
        return false;
    }
}
}