feat(redundancy): gate scripted-alarm alerts publish on Primary (A1)

This commit is contained in:
Joseph Doherty
2026-06-11 08:44:44 -04:00
parent 535787bd85
commit 06c415598c
3 changed files with 186 additions and 6 deletions
@@ -280,7 +280,7 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
var engine = new ScriptedAlarmEngine(
upstream, store, new ScriptLoggerFactory(_scriptRootLogger.Logger), _scriptRootLogger.Logger);
_scriptedAlarmHost = Context.ActorOf(
ScriptedAlarmHostActor.Props(_opcUaPublishActor, _dependencyMux, upstream, engine),
ScriptedAlarmHostActor.Props(_opcUaPublishActor, _dependencyMux, upstream, engine, _localNode),
"scripted-alarm-host");
}
@@ -2,7 +2,9 @@ using Akka.Actor;
using Akka.Cluster.Tools.PublishSubscribe;
using Akka.Event;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Alerts;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Redundancy;
using ZB.MOM.WW.OtOpcUa.Commons.OpcUa;
using ZB.MOM.WW.OtOpcUa.Commons.Types;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
using ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms;
using ZB.MOM.WW.OtOpcUa.OpcUaServer;
@@ -95,10 +97,17 @@ public sealed class ScriptedAlarmHostActor : ReceiveActor
private readonly IActorRef? _mux;
private readonly DependencyMuxTagUpstreamSource _upstream;
private readonly ScriptedAlarmEngine _engine;
private readonly NodeId? _localNode;
private readonly ILoggingAdapter _log = Context.GetLogger();
private readonly CancellationTokenSource _cts = new();
private readonly EventHandler<ScriptedAlarmEvent> _onEngineEvent;
/// <summary>Cached local <see cref="RedundancyRole"/> from the latest <see cref="RedundancyStateChanged"/>
/// snapshot (null = unknown until the first snapshot arrives, or no <see cref="_localNode"/> wired). The
/// cluster-wide <c>alerts</c> publish in <see cref="OnEngineEmission"/> is gated on this: only the Primary
/// publishes (default-emit while unknown so single-node deploys + the boot window never drop transitions).</summary>
private RedundancyRole? _localRole;
/// <summary>Monotonic load generation, bumped on every <see cref="OnApply"/>. The continuation that
/// pipes back an <see cref="AlarmsLoaded"/> captures the generation it was started under; a stale
/// completion (an earlier generation arriving after a newer apply) is discarded in
@@ -118,23 +127,31 @@ public sealed class ScriptedAlarmHostActor : ReceiveActor
/// <param name="upstream">The mux-fed upstream the engine reads + subscribes from. MUST be the
/// same instance the <paramref name="engine"/> was constructed around.</param>
/// <param name="engine">The scripted-alarm engine this host owns + disposes.</param>
/// <param name="localNode">The local cluster node id, used to read this node's <see cref="RedundancyRole"/>
/// from the <c>redundancy-state</c> topic so only the Primary publishes the cluster-wide <c>alerts</c>
/// transition. Null (the default) leaves the role unknown ⇒ default-emit (single-node deploys + tests).</param>
public static Props Props(
IActorRef publishActor,
IActorRef? mux,
DependencyMuxTagUpstreamSource upstream,
ScriptedAlarmEngine engine) =>
Akka.Actor.Props.Create(() => new ScriptedAlarmHostActor(publishActor, mux, upstream, engine));
ScriptedAlarmEngine engine,
NodeId? localNode = null) =>
Akka.Actor.Props.Create(() => new ScriptedAlarmHostActor(publishActor, mux, upstream, engine, localNode));
/// <summary>Initializes a new instance of the <see cref="ScriptedAlarmHostActor"/> class.</summary>
/// <param name="publishActor">The OPC UA publish actor emissions are bridged to.</param>
/// <param name="mux">Optional dependency multiplexer the host registers dependency interest with.</param>
/// <param name="upstream">The mux-fed upstream the engine reads + subscribes from (same instance the engine wraps).</param>
/// <param name="engine">The scripted-alarm engine this host owns + disposes.</param>
/// <param name="localNode">The local cluster node id, used to read this node's <see cref="RedundancyRole"/>
/// from the <c>redundancy-state</c> topic so only the Primary publishes the cluster-wide <c>alerts</c>
/// transition. Null leaves the role unknown ⇒ default-emit (single-node deploys + tests).</param>
public ScriptedAlarmHostActor(
IActorRef publishActor,
IActorRef? mux,
DependencyMuxTagUpstreamSource upstream,
ScriptedAlarmEngine engine)
ScriptedAlarmEngine engine,
NodeId? localNode = null)
{
ArgumentNullException.ThrowIfNull(publishActor);
ArgumentNullException.ThrowIfNull(upstream);
@@ -143,6 +160,7 @@ public sealed class ScriptedAlarmHostActor : ReceiveActor
_mux = mux;
_upstream = upstream;
_engine = engine;
_localNode = localNode;
// OnEvent fires on the engine's worker thread. NEVER touch Context / actor state here —
// marshal onto the actor thread via the thread-safe Self.Tell. Keep the handler in a field
@@ -155,6 +173,9 @@ public sealed class ScriptedAlarmHostActor : ReceiveActor
Receive<AlarmsLoaded>(OnAlarmsLoaded);
Receive<VirtualTagActor.DependencyValueChanged>(OnDependencyChanged);
Receive<EngineEmission>(OnEngineEmission);
// Cluster redundancy snapshots (published on the `redundancy-state` topic, subscribed in PreStart)
// cache this node's role so OnEngineEmission can gate the cluster-wide alerts publish to the Primary.
Receive<RedundancyStateChanged>(OnRedundancyStateChanged);
// Inbound OPC UA Part 9 alarm method calls arrive as AlarmCommands on the cluster
// `alarm-commands` DPS topic (T18 publishes them after the AlarmAck role gate). The topic is a
// cluster-wide broadcast — every host node receives every command — so OnAlarmCommand filters to
@@ -275,9 +296,37 @@ public sealed class ScriptedAlarmHostActor : ReceiveActor
User: TransitionUser(e),
TimestampUtc: e.TimestampUtc);
// Warm-standby dedup: only the Primary (driver-role leader) publishes the cluster-wide
// transition. Default-emit until told we are Secondary/Detached so single-node deploys + the
// boot window never drop transitions. The OPC UA node write above + inbound command processing
// stay ungated (the secondary keeps its address space + engine state warm for failover).
if (_localRole is RedundancyRole.Secondary or RedundancyRole.Detached)
{
return;
}
_mediator.Tell(new Publish(AlertsTopic, evt));
}
/// <summary>Caches this node's <see cref="RedundancyRole"/> from a cluster redundancy snapshot so
/// <see cref="OnEngineEmission"/> can gate the cluster-wide <c>alerts</c> publish to the Primary. A
/// snapshot that doesn't mention <see cref="_localNode"/> (or no local node wired) leaves the cached
/// role unchanged ⇒ default-emit. Mirrors <see cref="OpcUaPublishActor"/>'s handler.</summary>
/// <param name="msg">The cluster redundancy snapshot.</param>
private void OnRedundancyStateChanged(RedundancyStateChanged msg)
{
if (_localNode is null)
{
return;
}
var local = msg.Nodes.FirstOrDefault(n => n.NodeId == _localNode.Value);
if (local is not null)
{
_localRole = local.Role;
}
}
/// <summary>
/// Drives an inbound OPC UA Part 9 alarm method call (delivered as an <see cref="AlarmCommand"/>
/// on the cluster <c>alarm-commands</c> topic) onto the matching <see cref="ScriptedAlarmEngine"/>
@@ -386,6 +435,10 @@ public sealed class ScriptedAlarmHostActor : ReceiveActor
// the node manager's condition handlers, T18) land here as AlarmCommands. The Subscribe is sent
// from Self so the SubscribeAck returns to this actor (handled as a no-op in the ctor wiring).
_mediator.Tell(new Subscribe(AlarmCommandsTopic, Self));
// Also subscribe to the `redundancy-state` topic so cluster role changes land as
// RedundancyStateChanged and cache this node's role — OnEngineEmission gates the cluster-wide
// alerts publish to the Primary so a 2-node single-cluster deploy doesn't double-emit transitions.
_mediator.Tell(new Subscribe(OpcUaPublishActor.RedundancyStateTopic, Self));
base.PreStart();
}