feat(redundancy): gate scripted-alarm alerts publish on Primary (A1)

This commit is contained in:
Joseph Doherty
2026-06-11 08:44:44 -04:00
parent 535787bd85
commit 06c415598c
3 changed files with 186 additions and 6 deletions
@@ -5,7 +5,9 @@ using Serilog;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Alerts;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Redundancy;
using ZB.MOM.WW.OtOpcUa.Commons.OpcUa;
using ZB.MOM.WW.OtOpcUa.Commons.Types;
using ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms;
using ZB.MOM.WW.OtOpcUa.Core.Scripting;
using ZB.MOM.WW.OtOpcUa.OpcUaServer;
@@ -74,15 +76,33 @@ public sealed class ScriptedAlarmHostActorTests : RuntimeActorTestBase
return new ScriptedAlarmEngine(upstream, new InMemoryAlarmStateStore(), new ScriptLoggerFactory(logger), logger);
}
/// <summary>Identity ("node-A") that the redundancy-gating tests use for the node under test: it is
/// passed to <c>Spawn</c> as the host's local node and stamped on the role snapshots the tests send.</summary>
private static readonly NodeId LocalNode = new NodeId("node-A");
/// <summary>Creates a <see cref="ScriptedAlarmHostActor"/> in the test actor system, wired to the two
/// supplied probes, a fresh <see cref="DependencyMuxTagUpstreamSource"/>, and an engine built over that
/// upstream via <c>BuildEngine</c>.</summary>
/// <param name="publish">Probe whose <c>Ref</c> is the first Props argument; the tests in this class
/// fish <c>OpcUaPublishActor.AlarmStateUpdate</c> messages from it.</param>
/// <param name="mux">Probe whose <c>Ref</c> is the second Props argument; the tests in this class
/// expect <c>DependencyMuxActor.RegisterInterest</c> on it.</param>
/// <param name="localNode">Optional local node id forwarded to Props as the last argument; defaults
/// to <c>null</c> when the caller does not supply one.</param>
/// <returns>The spawned host's actor ref together with the upstream source it was built with.</returns>
/// <remarks>NOTE(review): this view shows both the pre-change and post-change variants of the
/// parameter list and of the Props call; the variant taking <c>localNode</c> is the one the
/// redundancy-gating tests call — confirm against the actual file.</remarks>
private (IActorRef Host, DependencyMuxTagUpstreamSource Upstream) Spawn(
TestProbe publish, TestProbe mux)
TestProbe publish, TestProbe mux, NodeId? localNode = null)
{
var upstream = new DependencyMuxTagUpstreamSource();
var engine = BuildEngine(upstream);
var host = Sys.ActorOf(ScriptedAlarmHostActor.Props(publish.Ref, mux.Ref, upstream, engine));
var host = Sys.ActorOf(ScriptedAlarmHostActor.Props(publish.Ref, mux.Ref, upstream, engine, localNode));
return (host, upstream);
}
/// <summary>Sends <paramref name="host"/> a <see cref="RedundancyStateChanged"/> message whose snapshot
/// holds exactly one <see cref="NodeRedundancyState"/> entry: <see cref="LocalNode"/> in the requested
/// <paramref name="role"/>. Both leader flags are true only for the Primary role, and the entry is
/// timestamped with the current UTC time.</summary>
/// <param name="host">The actor that should observe the local role.</param>
/// <param name="role">The role to report for <see cref="LocalNode"/>.</param>
private static void TellRedundancyRole(IActorRef host, RedundancyRole role)
{
    // A single flag drives both leader fields, exactly as the two identical comparisons did.
    var isPrimary = role == RedundancyRole.Primary;

    var localState = new NodeRedundancyState(
        NodeId: LocalNode,
        Role: role,
        IsClusterLeader: isPrimary,
        IsRoleLeaderForDriver: isPrimary,
        AsOfUtc: DateTime.UtcNow);

    var snapshot = new RedundancyStateChanged(new[] { localState }, CorrelationId.NewId());
    host.Tell(snapshot);
}
/// <summary>Subscribe <paramref name="probe"/> to the <c>alerts</c> DPS topic and wait for the ack.
/// The Subscribe is sent FROM the probe so the SubscribeAck returns to it.</summary>
private void SubscribeToAlerts(TestProbe probe)
@@ -381,4 +401,111 @@ public sealed class ScriptedAlarmHostActorTests : RuntimeActorTestBase
var state = publish.FishForMessage<OpcUaPublishActor.AlarmStateUpdate>(m => m.State.Active, Timeout);
state.AlarmNodeId.ShouldBe("alm-1");
}
/// <summary>Default-emit (T1): the host is constructed WITH a localNode but is never sent a
/// RedundancyStateChanged snapshot, so its cached local role is unknown. That covers the boot window and
/// the steady state of single-node deploys (the sole node is always Primary). In that situation the host
/// MUST treat itself as Primary and publish the cluster-wide alerts transition.</summary>
[Fact]
public void Emission_is_published_to_alerts_by_default_before_any_redundancy_state()
{
    var publishProbe = CreateTestProbe();
    var muxProbe = CreateTestProbe();
    var alertsProbe = CreateTestProbe();
    SubscribeToAlerts(alertsProbe);

    var host = Spawn(publishProbe, muxProbe, LocalNode).Host;

    var plans = new[] { Plan(severity: 800) };
    host.Tell(new ScriptedAlarmHostActor.ApplyScriptedAlarms(plans));
    // Seeing the interest registration on the mux means the load has completed.
    muxProbe.ExpectMsg<DependencyMuxActor.RegisterInterest>(Timeout);

    // Deliberately no RedundancyStateChanged here: role stays unknown, so the gate defaults to emit.
    var tagChange = new VirtualTagActor.DependencyValueChanged("M.T", 99, DateTime.UtcNow);
    host.Tell(tagChange);

    // Both effects are expected: the OPC UA node write and the alerts transition.
    publishProbe.FishForMessage<OpcUaPublishActor.AlarmStateUpdate>(update => update.State.Active, Timeout);
    var transition = alertsProbe.ExpectMsg<AlarmTransitionEvent>(Timeout);
    transition.AlarmId.ShouldBe("alm-1");
    transition.TransitionKind.ShouldBe("Activated");
}
/// <summary>Secondary suppression (T1): once the cached local role is Secondary, the host MUST NOT
/// publish the cluster-wide alerts transition, because the Primary publishes the single copy. It MUST
/// still write the local OPC UA condition node so the secondary's address space stays warm for failover.</summary>
[Fact]
public void Secondary_node_suppresses_alerts_publish_but_still_writes_opcua()
{
    var publishProbe = CreateTestProbe();
    var muxProbe = CreateTestProbe();
    var alertsProbe = CreateTestProbe();
    SubscribeToAlerts(alertsProbe);

    var host = Spawn(publishProbe, muxProbe, LocalNode).Host;

    var plans = new[] { Plan(severity: 800) };
    host.Tell(new ScriptedAlarmHostActor.ApplyScriptedAlarms(plans));
    // Seeing the interest registration on the mux means the load has completed.
    muxProbe.ExpectMsg<DependencyMuxActor.RegisterInterest>(Timeout);

    // Role first, activation second: the host must already be Secondary when the value arrives.
    TellRedundancyRole(host, RedundancyRole.Secondary);
    var tagChange = new VirtualTagActor.DependencyValueChanged("M.T", 99, DateTime.UtcNow);
    host.Tell(tagChange);

    // The local OPC UA node write is not gated, so it must still show up.
    var activeUpdate = publishProbe.FishForMessage<OpcUaPublishActor.AlarmStateUpdate>(
        update => update.State.Active, Timeout);
    activeUpdate.AlarmNodeId.ShouldBe("alm-1");

    // The cluster-wide alerts publish is gated off on the secondary: nothing may arrive.
    var quietWindow = TimeSpan.FromMilliseconds(500);
    alertsProbe.ExpectNoMsg(quietWindow);
}
/// <summary>Primary publishes (T1): with the cached local role set to Primary, the host publishes the
/// cluster-wide alerts transition as normal. This is the single copy the fleet sees.</summary>
[Fact]
public void Primary_node_publishes_alerts()
{
    var publishProbe = CreateTestProbe();
    var muxProbe = CreateTestProbe();
    var alertsProbe = CreateTestProbe();
    SubscribeToAlerts(alertsProbe);

    var host = Spawn(publishProbe, muxProbe, LocalNode).Host;

    var plans = new[] { Plan(severity: 800) };
    host.Tell(new ScriptedAlarmHostActor.ApplyScriptedAlarms(plans));
    // Seeing the interest registration on the mux means the load has completed.
    muxProbe.ExpectMsg<DependencyMuxActor.RegisterInterest>(Timeout);

    // Role first, activation second: the host must already be Primary when the value arrives.
    TellRedundancyRole(host, RedundancyRole.Primary);
    var tagChange = new VirtualTagActor.DependencyValueChanged("M.T", 99, DateTime.UtcNow);
    host.Tell(tagChange);

    publishProbe.FishForMessage<OpcUaPublishActor.AlarmStateUpdate>(update => update.State.Active, Timeout);
    var transition = alertsProbe.ExpectMsg<AlarmTransitionEvent>(Timeout);
    transition.AlarmId.ShouldBe("alm-1");
    transition.TransitionKind.ShouldBe("Activated");
}
/// <summary>Inbound command ungated by role (T1): the alerts-publish gate must NOT interfere with inbound
/// command processing. While the node is Secondary, an AlarmCommand("Acknowledge") aimed at an owned,
/// active alarm must still drive the engine. The test observes this through the resulting
/// AlarmStateUpdate(Acknowledged=true); the OPC UA node write is ungated, so the secondary's engine
/// state and address space stay consistent.</summary>
[Fact]
public void Inbound_AlarmCommand_is_processed_regardless_of_role()
{
    var publishProbe = CreateTestProbe();
    var muxProbe = CreateTestProbe();

    var host = Spawn(publishProbe, muxProbe, LocalNode).Host;

    var plans = new[] { Plan(id: "alm-1", depRef: "M.T") };
    host.Tell(new ScriptedAlarmHostActor.ApplyScriptedAlarms(plans));
    // Seeing the interest registration on the mux means the load has completed.
    muxProbe.ExpectMsg<DependencyMuxActor.RegisterInterest>(Timeout);

    // Secondary role: the alerts publish is gated, command handling is not.
    TellRedundancyRole(host, RedundancyRole.Secondary);

    // Raise the alarm first so that there is an active, unacknowledged condition to act on.
    var tagChange = new VirtualTagActor.DependencyValueChanged("M.T", 99, DateTime.UtcNow);
    host.Tell(tagChange);
    publishProbe.FishForMessage<OpcUaPublishActor.AlarmStateUpdate>(
        update => update.State.Active && !update.State.Acknowledged, Timeout);

    // Send the acknowledge command; the engine has to process it even on the secondary.
    var ackCommand = new AlarmCommand(
        AlarmId: "alm-1", Operation: "Acknowledge", User: "alice", Comment: "ack-note", UnshelveAtUtc: null);
    host.Tell(ackCommand);

    var ackedUpdate = publishProbe.FishForMessage<OpcUaPublishActor.AlarmStateUpdate>(
        update => update.State.Acknowledged, Timeout);
    ackedUpdate.AlarmNodeId.ShouldBe("alm-1");
    ackedUpdate.State.Acknowledged.ShouldBeTrue();
}
}