feat(redundancy): OpcUaProbeOk from peer-probes-me with freshness debounce

This commit is contained in:
Joseph Doherty
2026-06-15 13:04:41 -04:00
parent a9ff1a64b2
commit cf278035d2
2 changed files with 162 additions and 6 deletions
@@ -59,6 +59,7 @@ public sealed class OpcUaPublishActor : ReceiveActor
private readonly Phase7Applier? _applier;
private readonly IActorRef? _dbHealthProbe;
private readonly TimeSpan _staleWindow;
private TimeSpan _probeFreshnessWindow;
private readonly Akka.Cluster.Cluster _cluster = Akka.Cluster.Cluster.Get(Context.System);
private readonly ILoggingAdapter _log = Context.GetLogger();
@@ -67,6 +68,7 @@ public sealed class OpcUaPublishActor : ReceiveActor
private bool _publishedAtLeastOnce;
private DbHealthProbeActor.DbHealthStatus? _lastDbHealth;
private RedundancyStateChanged? _lastSnapshot;
private (bool Ok, DateTime At)? _probeAboutMe;
private Phase7CompositionResult _lastApplied = new(
Array.Empty<UnsAreaProjection>(),
Array.Empty<UnsLineProjection>(),
@@ -95,6 +97,9 @@ public sealed class OpcUaPublishActor : ReceiveActor
/// legacy role-only ServiceLevel seam is used until a <see cref="DbHealthProbeActor.DbHealthStatus"/> arrives.</param>
/// <param name="staleWindow">The window beyond which a DB-health sample or redundancy snapshot is
/// considered stale; defaults to 30 seconds.</param>
/// <param name="probeFreshnessWindow">The window beyond which a peer's OPC UA probe verdict about
/// this node is considered stale (and thus given the benefit of the doubt rather than demoting);
/// defaults to 30 seconds.</param>
public static Props Props(
IOpcUaAddressSpaceSink? sink = null,
IServiceLevelPublisher? serviceLevel = null,
@@ -102,7 +107,8 @@ public sealed class OpcUaPublishActor : ReceiveActor
IDbContextFactory<OtOpcUaConfigDbContext>? dbFactory = null,
Phase7Applier? applier = null,
IActorRef? dbHealthProbe = null,
TimeSpan? staleWindow = null) =>
TimeSpan? staleWindow = null,
TimeSpan? probeFreshnessWindow = null) =>
Akka.Actor.Props.Create(() => new OpcUaPublishActor(
sink ?? NullOpcUaAddressSpaceSink.Instance,
serviceLevel ?? NullServiceLevelPublisher.Instance,
@@ -111,7 +117,8 @@ public sealed class OpcUaPublishActor : ReceiveActor
dbFactory,
applier,
dbHealthProbe,
staleWindow)).WithDispatcher(DispatcherId);
staleWindow,
probeFreshnessWindow)).WithDispatcher(DispatcherId);
/// <summary>Test-only Props that omits the pinned-dispatcher requirement and skips the
/// DPS subscribe so unit tests can spin up the actor on a vanilla TestKit cluster.</summary>
@@ -125,6 +132,9 @@ public sealed class OpcUaPublishActor : ReceiveActor
/// legacy role-only ServiceLevel seam is used until a <see cref="DbHealthProbeActor.DbHealthStatus"/> arrives.</param>
/// <param name="staleWindow">The window beyond which a DB-health sample or redundancy snapshot is
/// considered stale; defaults to 30 seconds.</param>
/// <param name="probeFreshnessWindow">The window beyond which a peer's OPC UA probe verdict about
/// this node is considered stale (and thus given the benefit of the doubt rather than demoting);
/// defaults to 30 seconds.</param>
public static Props PropsForTests(
IOpcUaAddressSpaceSink? sink = null,
IServiceLevelPublisher? serviceLevel = null,
@@ -133,7 +143,8 @@ public sealed class OpcUaPublishActor : ReceiveActor
IDbContextFactory<OtOpcUaConfigDbContext>? dbFactory = null,
Phase7Applier? applier = null,
IActorRef? dbHealthProbe = null,
TimeSpan? staleWindow = null) =>
TimeSpan? staleWindow = null,
TimeSpan? probeFreshnessWindow = null) =>
Akka.Actor.Props.Create(() => new OpcUaPublishActor(
sink ?? NullOpcUaAddressSpaceSink.Instance,
serviceLevel ?? NullServiceLevelPublisher.Instance,
@@ -142,7 +153,8 @@ public sealed class OpcUaPublishActor : ReceiveActor
dbFactory,
applier,
dbHealthProbe,
staleWindow));
staleWindow,
probeFreshnessWindow));
/// <summary>Initializes a new instance of the <see cref="OpcUaPublishActor"/> class.</summary>
/// <param name="sink">The OPC UA address space sink.</param>
@@ -155,6 +167,9 @@ public sealed class OpcUaPublishActor : ReceiveActor
/// legacy role-only ServiceLevel seam is used until a <see cref="DbHealthProbeActor.DbHealthStatus"/> arrives.</param>
/// <param name="staleWindow">The window beyond which a DB-health sample or redundancy snapshot is
/// considered stale; defaults to 30 seconds.</param>
/// <param name="probeFreshnessWindow">The window beyond which a peer's OPC UA probe verdict about
/// this node is considered stale (and thus given the benefit of the doubt rather than demoting);
/// defaults to 30 seconds.</param>
public OpcUaPublishActor(
IOpcUaAddressSpaceSink sink,
IServiceLevelPublisher serviceLevel,
@@ -163,7 +178,8 @@ public sealed class OpcUaPublishActor : ReceiveActor
IDbContextFactory<OtOpcUaConfigDbContext>? dbFactory = null,
Phase7Applier? applier = null,
IActorRef? dbHealthProbe = null,
TimeSpan? staleWindow = null)
TimeSpan? staleWindow = null,
TimeSpan? probeFreshnessWindow = null)
{
_sink = sink;
_serviceLevel = serviceLevel;
@@ -173,6 +189,7 @@ public sealed class OpcUaPublishActor : ReceiveActor
_applier = applier;
_dbHealthProbe = dbHealthProbe;
_staleWindow = staleWindow ?? TimeSpan.FromSeconds(30);
_probeFreshnessWindow = probeFreshnessWindow ?? TimeSpan.FromSeconds(30);
Receive<AttributeValueUpdate>(HandleAttributeUpdate);
Receive<AlarmStateUpdate>(HandleAlarmUpdate);
@@ -180,6 +197,7 @@ public sealed class OpcUaPublishActor : ReceiveActor
Receive<ServiceLevelChanged>(HandleServiceLevelChanged);
Receive<RedundancyStateChanged>(HandleRedundancyStateChanged);
Receive<DbHealthProbeActor.DbHealthStatus>(HandleDbHealthStatus);
Receive<PeerOpcUaProbeActor.OpcUaProbeResult>(HandlePeerProbe);
Receive<SubscribeAck>(_ => { /* PubSub ack */ });
}
@@ -378,6 +396,31 @@ public sealed class OpcUaPublishActor : ReceiveActor
RecomputeServiceLevel();
}
/// <summary>Handles an OPC UA probe result produced by a peer. The result's
/// <see cref="PeerOpcUaProbeActor.OpcUaProbeResult.NodeId"/> names the node that was probed, so
/// only a result whose <c>NodeId</c> equals this node's identity is a verdict about THIS node;
/// anything else (or any result received while the local node identity is unknown) is dropped.
/// An accepted verdict is stored together with its receive time so <see cref="OpcUaProbeOk"/>
/// can debounce stale verdicts, and the local ServiceLevel is then recomputed.</summary>
/// <param name="r">The probe result to evaluate.</param>
private void HandlePeerProbe(PeerOpcUaProbeActor.OpcUaProbeResult r)
{
    // Without a known local identity there is nothing to match the probed target against.
    if (_localNode is null) return;

    // The probe targeted some other node — it is a verdict about a peer, not about me.
    if (r.NodeId != _localNode.Value) return;

    // Stamp with the receive time (UTC) so freshness is measured on this node's own clock.
    var receivedAtUtc = DateTime.UtcNow;
    _probeAboutMe = (r.Ok, receivedAtUtc);

    RecomputeServiceLevel();
}
/// <summary>Supplies the calculator's OPC UA self-probe input: whether a peer has recently seen
/// MY OPC UA endpoint as reachable. The answer is <c>true</c> (benefit of the doubt) in two cases:
/// no peer verdict has been recorded yet (single-node / no peer), or the most recent verdict is
/// older than <see cref="_probeFreshnessWindow"/> (the peer went away — this node is not penalised
/// for that). Otherwise the recorded verdict is returned as-is, so only a RECENT, actively
/// observed <c>Ok==false</c> demotes.</summary>
/// <returns><c>true</c> when there is no verdict, the verdict is stale, or the verdict is OK;
/// <c>false</c> only for a fresh failing verdict.</returns>
private bool OpcUaProbeOk()
{
    // No verdict recorded yet: nothing to hold against this node.
    if (_probeAboutMe is null) return true;

    var (peerSawMeOk, observedAtUtc) = _probeAboutMe.Value;
    var verdictAge = DateTime.UtcNow - observedAtUtc;

    // A verdict past the freshness window is disregarded (treated as OK); a fresh one stands.
    return verdictAge > _probeFreshnessWindow || peerSawMeOk;
}
/// <summary>
/// Computes the local OPC UA ServiceLevel and routes it through <see cref="ServiceLevelChanged"/>
/// (the dedup/publish/metric handler). The full <see cref="ServiceLevelCalculator"/> path is
@@ -413,7 +456,7 @@ public sealed class OpcUaPublishActor : ReceiveActor
var inputs = new NodeHealthInputs(
MemberState: SafeSelfStatus(),
DbReachable: _lastDbHealth.Reachable,
OpcUaProbeOk: true, // TODO(2b): wire the real OPC UA self-probe result here.
OpcUaProbeOk: OpcUaProbeOk(),
Stale: !_lastDbHealth.Reachable
|| (now - _lastDbHealth.AsOfUtc) > _staleWindow
|| (now - entry.AsOfUtc) > _staleWindow,