worker(alarms): FailoverAlarmConsumer auto-failover/failback state machine

This commit is contained in:
Joseph Doherty
2026-06-13 09:46:47 -04:00
parent fd64b9260c
commit 0a54c0bc4b
4 changed files with 640 additions and 0 deletions
@@ -0,0 +1,211 @@
using System;
using System.Collections.Generic;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Worker.MxAccess;
using Xunit;
namespace ZB.MOM.WW.MxGateway.Worker.Tests.MxAccess;
/// <summary>
/// Behavioral tests for <see cref="FailoverAlarmConsumer"/>. They cover the
/// failover path (enough consecutive primary failures → standby), the
/// failback path (enough consecutive clean probes → primary), forwarding of
/// transitions from whichever child is active, and routing of
/// acknowledgments to the active child. Both children are in-memory fakes,
/// so no AVEVA install is required.
/// </summary>
public sealed class FailoverAlarmConsumerTests
{
    /// <summary>Subscription expression handed to the consumer under test.</summary>
    private const string Subscription = @"\\HOST\Galaxy!Area";

    /// <summary>HRESULT carried by the COMException the flaky primary throws.</summary>
    private const int FailureHResult = unchecked((int)0x80004005);

    /// <summary>
    /// Primary fake. While <see cref="ThrowOnPoll"/> is set, both
    /// <c>Subscribe</c> and <c>PollOnce</c> throw a COMException, modeling a
    /// wnwrap consumer surfacing a COM HRESULT failure. It can also raise a
    /// transition on demand so forwarding can be exercised.
    /// </summary>
    private sealed class FlakyPrimary : IMxAccessAlarmConsumer
    {
        public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;

        public bool ThrowOnPoll = true;

        public int Polls;

        public void Subscribe(string subscription) => ThrowWhileFlaky();

        public void PollOnce()
        {
            // Count the attempt before (possibly) failing it.
            Polls++;
            ThrowWhileFlaky();
        }

        public int AcknowledgeByGuid(Guid alarmGuid, string comment, string name, string node, string domain, string fullName) => 11;

        public int AcknowledgeByName(string alarm, string provider, string group, string comment, string name, string node, string domain, string fullName) => 11;

        public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms() => Array.Empty<MxAlarmSnapshotRecord>();

        public void Dispose()
        {
        }

        public void Raise(MxAlarmTransitionEvent e) => AlarmTransitionEmitted?.Invoke(this, e);

        // Shared failure path for Subscribe and PollOnce.
        private void ThrowWhileFlaky()
        {
            if (!ThrowOnPoll)
            {
                return;
            }

            throw new System.Runtime.InteropServices.COMException("boom", unchecked((int)0x80004005));
        }
    }

    /// <summary>
    /// Standby fake (stand-in for the subtag consumer). It never throws,
    /// remembers that it was subscribed, and can raise a transition on demand.
    /// </summary>
    private sealed class StubStandby : IMxAccessAlarmConsumer
    {
        public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;

        public bool Subscribed;

        public void Subscribe(string subscription)
        {
            Subscribed = true;
        }

        public void PollOnce()
        {
        }

        public int AcknowledgeByGuid(Guid alarmGuid, string comment, string name, string node, string domain, string fullName) => 22;

        public int AcknowledgeByName(string alarm, string provider, string group, string comment, string name, string node, string domain, string fullName) => 22;

        public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms() => Array.Empty<MxAlarmSnapshotRecord>();

        public void Dispose()
        {
        }

        public void Raise(MxAlarmTransitionEvent e) => AlarmTransitionEmitted?.Invoke(this, e);
    }

    /// <summary>Builds a fresh transition with a unique alarm GUID.</summary>
    private static MxAlarmTransitionEvent SampleTransition()
    {
        return new MxAlarmTransitionEvent
        {
            Record = new MxAlarmSnapshotRecord { AlarmGuid = Guid.NewGuid() },
            PreviousState = MxAlarmStateKind.Unspecified,
        };
    }

    /// <summary>
    /// Wires the consumer under test over the two fakes, always with a probe
    /// interval of zero.
    /// </summary>
    private static FailoverAlarmConsumer Compose(
        FlakyPrimary primary,
        StubStandby standby,
        int threshold,
        int stableProbes)
    {
        return new FailoverAlarmConsumer(
            primary,
            standby,
            new FailoverSettings(threshold: threshold, probeIntervalSeconds: 0, stableProbes: stableProbes));
    }

    /// <summary>
    /// Three consecutive primary failures (one subscribe, two polls) with a
    /// threshold of three produce exactly one switch to Subtag that carries
    /// the failing HRESULT.
    /// </summary>
    [Fact]
    public void Primary_FailsThresholdTimes_SwitchesToSubtag()
    {
        var primary = new FlakyPrimary { ThrowOnPoll = true };
        var standby = new StubStandby();
        using FailoverAlarmConsumer sut = Compose(primary, standby, threshold: 3, stableProbes: 1);
        var changes = new List<AlarmProviderModeChange>();
        sut.ProviderModeChanged += (_, change) => changes.Add(change);

        // First primary failure; the standby must be armed regardless.
        sut.Subscribe(Subscription);
        Assert.True(standby.Subscribed);
        Assert.Empty(changes);

        // Second failure: still below the threshold.
        sut.PollOnce();
        Assert.Empty(changes);

        // Third failure reaches the threshold and triggers the switch.
        sut.PollOnce();
        AlarmProviderModeChange switched = Assert.Single(changes);
        Assert.Equal(AlarmProviderMode.Subtag, switched.Mode);
        Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);
        Assert.Equal(FailureHResult, switched.HResult);
    }

    /// <summary>
    /// Once the standby is active, a transition it raises is re-raised by the
    /// composite with the same event instance.
    /// </summary>
    [Fact]
    public void AfterSwitch_StandbyTransitionsAreForwarded()
    {
        var primary = new FlakyPrimary { ThrowOnPoll = true };
        var standby = new StubStandby();
        using FailoverAlarmConsumer sut = Compose(primary, standby, threshold: 1, stableProbes: 1);
        MxAlarmTransitionEvent? forwarded = null;
        sut.AlarmTransitionEmitted += (_, transition) => forwarded = transition;

        // With a threshold of one, the failing subscribe switches immediately.
        sut.Subscribe(Subscription);
        Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);

        MxAlarmTransitionEvent raised = SampleTransition();
        standby.Raise(raised);

        Assert.Same(raised, forwarded);
    }

    /// <summary>
    /// While degraded, a healed primary is only failed back to after the
    /// configured number of consecutive clean probes (two here).
    /// </summary>
    [Fact]
    public void WhileDegraded_PrimaryHeals_FailsBackAfterStableProbes()
    {
        var primary = new FlakyPrimary { ThrowOnPoll = true };
        var standby = new StubStandby();
        using FailoverAlarmConsumer sut = Compose(primary, standby, threshold: 1, stableProbes: 2);
        var changes = new List<AlarmProviderModeChange>();
        sut.ProviderModeChanged += (_, change) => changes.Add(change);

        // The failing subscribe produces the first (failover) change.
        sut.Subscribe(Subscription);
        AlarmProviderModeChange failover = Assert.Single(changes);
        Assert.Equal(AlarmProviderMode.Subtag, failover.Mode);

        // The primary heals; the first clean probe alone is not enough.
        primary.ThrowOnPoll = false;
        sut.ProbeOnce();
        Assert.Single(changes);

        // The second clean probe completes the run and fails back.
        sut.ProbeOnce();
        Assert.Equal(2, changes.Count);
        AlarmProviderModeChange failback = changes[changes.Count - 1];
        Assert.Equal(AlarmProviderMode.Alarmmgr, failback.Mode);
        Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
        Assert.Equal(0, failback.HResult);
    }

    /// <summary>
    /// With a healthy primary active, primary transitions are forwarded and
    /// standby transitions are suppressed.
    /// </summary>
    [Fact]
    public void BeforeFailover_PrimaryTransitionsAreForwarded()
    {
        // A healthy primary never throws, so it stays the active child.
        var primary = new FlakyPrimary { ThrowOnPoll = false };
        var standby = new StubStandby();
        using FailoverAlarmConsumer sut = Compose(primary, standby, threshold: 3, stableProbes: 1);
        var forwarded = new List<MxAlarmTransitionEvent>();
        sut.AlarmTransitionEmitted += (_, transition) => forwarded.Add(transition);

        sut.Subscribe(Subscription);
        Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);

        // The active child's transition passes straight through.
        MxAlarmTransitionEvent fromPrimary = SampleTransition();
        primary.Raise(fromPrimary);
        MxAlarmTransitionEvent only = Assert.Single(forwarded);
        Assert.Same(fromPrimary, only);

        // The inactive child's transition must not add a second entry.
        standby.Raise(SampleTransition());
        Assert.Single(forwarded);
    }

    /// <summary>
    /// Acknowledgments return the primary's sentinel (11) before failover and
    /// the standby's sentinel (22) after it.
    /// </summary>
    [Fact]
    public void Acknowledge_DelegatesToActiveChild()
    {
        var primary = new FlakyPrimary { ThrowOnPoll = false };
        var standby = new StubStandby();
        using FailoverAlarmConsumer sut = Compose(primary, standby, threshold: 1, stableProbes: 1);

        int AckByGuid() => sut.AcknowledgeByGuid(Guid.NewGuid(), "c", "n", "node", "dom", "full");
        int AckByName() => sut.AcknowledgeByName("a", "p", "g", "c", "n", "node", "dom", "full");

        sut.Subscribe(Subscription);

        // Primary is active: expect its sentinel.
        Assert.Equal(11, AckByGuid());
        Assert.Equal(11, AckByName());

        // Break the primary; a single failed poll meets the threshold of one.
        primary.ThrowOnPoll = true;
        sut.PollOnce();
        Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);

        // Standby is active: expect its sentinel.
        Assert.Equal(22, AckByGuid());
        Assert.Equal(22, AckByName());
    }
}
@@ -0,0 +1,61 @@
using System;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
/// <summary>
/// Event payload published by <see cref="FailoverAlarmConsumer"/> each time
/// the active alarm source flips between the primary (alarmmgr) consumer
/// and the standby (subtag) consumer. The worker maps it onto the proto
/// family <c>OnAlarmProviderModeChanged</c> so connected gateway clients can
/// show the degraded or recovered state.
/// </summary>
/// <remarks>
/// <para>
/// Deliberately an ordinary class whose get-only properties are assigned
/// in the constructor, rather than a <c>record</c> or a type with
/// <c>init</c> accessors: the worker multi-targets .NET Framework 4.8,
/// where <c>System.Runtime.CompilerServices.IsExternalInit</c> is missing
/// (CS0518).
/// </para>
/// </remarks>
public sealed class AlarmProviderModeChange : EventArgs
{
    /// <summary>
    /// Provider mode that is active once the switch has happened.
    /// </summary>
    public AlarmProviderMode Mode { get; }

    /// <summary>
    /// Human-readable explanation of the switch, for example the failing COM
    /// call or <c>"recovered"</c> for a failback. Never <c>null</c>.
    /// </summary>
    public string Reason { get; }

    /// <summary>
    /// COM HRESULT that caused a failover, or 0 when no HRESULT applies.
    /// </summary>
    public int HResult { get; }

    /// <summary>
    /// UTC instant at which the switch took place.
    /// </summary>
    public DateTime AtUtc { get; }

    /// <summary>
    /// Creates the payload describing one provider switch.
    /// </summary>
    /// <param name="mode">Provider mode that is active after the switch.</param>
    /// <param name="reason">
    /// Human-readable reason for the switch; <c>null</c> is stored as an
    /// empty string.
    /// </param>
    /// <param name="hResult">
    /// COM HRESULT that triggered a failover, or 0 for a clean failback or
    /// when no HRESULT is associated.
    /// </param>
    /// <param name="atUtc">UTC instant at which the switch occurred.</param>
    public AlarmProviderModeChange(AlarmProviderMode mode, string reason, int hResult, DateTime atUtc)
    {
        this.Mode = mode;
        this.HResult = hResult;
        this.AtUtc = atUtc;

        // Normalize a missing reason so consumers never see null.
        this.Reason = reason is null ? string.Empty : reason;
    }
}
@@ -0,0 +1,311 @@
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
/// <summary>
/// Composite <see cref="IMxAccessAlarmConsumer"/> that owns a PRIMARY
/// consumer (the wnwrap <see cref="WnWrapAlarmConsumer"/> alarmmgr source)
/// and a STANDBY consumer (the <c>SubtagAlarmConsumer</c> subtag fallback),
/// and switches between them automatically:
/// <list type="bullet">
///   <item><description>
///     Auto-fails-over to standby after
///     <see cref="FailoverSettings.Threshold"/> consecutive failures on the
///     primary.
///   </description></item>
///   <item><description>
///     Auto-fails-back to primary after
///     <see cref="FailoverSettings.StableProbes"/> consecutive clean
///     failback probes against the recovering primary.
///   </description></item>
/// </list>
/// It re-raises <see cref="AlarmTransitionEmitted"/> from whichever child
/// is active and raises <see cref="ProviderModeChanged"/> on every switch.
/// </summary>
/// <remarks>
/// <para>
/// <strong>Active-child event forwarding.</strong> This type subscribes
/// to <em>both</em> children's <see cref="AlarmTransitionEmitted"/>
/// events in the constructor and gates re-raising on identity: a child
/// transition is forwarded only when its <c>sender</c> is the currently
/// active child. The standby is subscribed from the start so it is warm
/// at the moment of failover, but its transitions stay suppressed until
/// it becomes active. Gating-by-active avoids subscribe/unsubscribe churn
/// on every switch.
/// </para>
/// <para>
/// <strong>Threading.</strong> Like its children, this type is intended
/// to be driven entirely from the worker's STA: <see cref="Subscribe"/>,
/// <see cref="PollOnce"/>, <see cref="ProbeOnce"/>, and the
/// <c>AcknowledgeBy*</c> calls. It takes no locks and runs no timer
/// callbacks of its own; the worker drives <see cref="PollOnce"/>. The
/// only clock it holds is a stopwatch used to space out failback probes.
/// </para>
/// </remarks>
public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
{
    private enum Active
    {
        Primary,
        Standby,
    }

    private readonly IMxAccessAlarmConsumer primary;
    private readonly IMxAccessAlarmConsumer standby;
    private readonly FailoverSettings settings;

    // Monotonic clock measuring the time since the most recent failback
    // probe. Not running means "no probe yet in this degraded episode".
    private readonly System.Diagnostics.Stopwatch sinceLastProbe = new System.Diagnostics.Stopwatch();

    private Active active = Active.Primary;
    private AlarmProviderMode mode = AlarmProviderMode.Alarmmgr;
    private int consecutiveFailures;
    private int cleanProbes;
    private bool disposed;

    // Last expression passed to Subscribe; failback probes re-subscribe the
    // primary with it so a recovered primary watches the same alarms.
    private string? lastSubscription;

    /// <summary>
    /// Composes the failover consumer over its two children and hooks both
    /// children's transition events.
    /// </summary>
    /// <param name="primary">The PRIMARY (alarmmgr) consumer.</param>
    /// <param name="standby">The STANDBY (subtag) consumer.</param>
    /// <param name="settings">The failover/failback tunables.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public FailoverAlarmConsumer(
        IMxAccessAlarmConsumer primary,
        IMxAccessAlarmConsumer standby,
        FailoverSettings settings)
    {
        this.primary = primary ?? throw new ArgumentNullException(nameof(primary));
        this.standby = standby ?? throw new ArgumentNullException(nameof(standby));
        this.settings = settings ?? throw new ArgumentNullException(nameof(settings));
        this.primary.AlarmTransitionEmitted += OnChildTransition;
        this.standby.AlarmTransitionEmitted += OnChildTransition;
    }

    /// <inheritdoc />
    public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;

    /// <summary>
    /// Fires on every switch between primary and standby. Carries the new
    /// <see cref="AlarmProviderMode"/>, the reason, the triggering HRESULT
    /// (0 for a clean failback), and the UTC instant.
    /// </summary>
    public event EventHandler<AlarmProviderModeChange>? ProviderModeChanged;

    /// <summary>
    /// The provider mode currently active.
    /// </summary>
    public AlarmProviderMode Mode => mode;

    private IMxAccessAlarmConsumer ActiveChild => active == Active.Primary ? primary : standby;

    /// <inheritdoc />
    /// <remarks>
    /// Subscribes BOTH children. The standby is subscribed first, so it is
    /// armed even if the primary's <c>Subscribe</c> throws; a standby
    /// subscribe failure propagates to the caller and does not count toward
    /// primary failover. The primary subscribe runs through the
    /// failure-counting wrapper, so a failure there contributes to the
    /// failover threshold instead of escaping. The expression is remembered
    /// so later failback probes can re-subscribe the primary with it.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public void Subscribe(string subscription)
    {
        ThrowIfDisposed();

        lastSubscription = subscription;

        // Arm the standby first so it is warm regardless of primary outcome.
        // A failure here means the fallback itself is broken, so it is left
        // to propagate.
        standby.Subscribe(subscription);

        // Swallowing a primary failure is deliberate: the standby is already
        // armed, so a failed primary subscribe just nudges the state machine
        // toward (or into) standby rather than aborting startup.
        RunPrimary(() => primary.Subscribe(subscription), "Subscribe");
    }

    /// <inheritdoc />
    /// <remarks>
    /// While the primary is active, drives <c>primary.PollOnce</c> through
    /// the failure-counting wrapper. While degraded (standby active), drives
    /// <c>standby.PollOnce</c> and then runs a failback probe via
    /// <see cref="ProbeOnce"/>, but only if at least
    /// <see cref="FailoverSettings.ProbeIntervalSeconds"/> seconds have
    /// elapsed since the previous probe (an interval of 0 probes on every
    /// degraded poll).
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public void PollOnce()
    {
        ThrowIfDisposed();

        if (active == Active.Primary)
        {
            RunPrimary(() => primary.PollOnce(), "PollOnce");
            return;
        }

        // Degraded: pump the standby for live transitions, then check the
        // primary for recovery if a probe is due.
        standby.PollOnce();
        if (IsProbeDue())
        {
            ProbeOnce();
        }
    }

    /// <summary>
    /// Runs one failback probe against the (presumed recovering) primary.
    /// Only meaningful while the standby is active; a no-op otherwise.
    /// </summary>
    /// <remarks>
    /// A clean probe (primary <c>Subscribe</c> with the last subscription
    /// expression, then <c>PollOnce</c>, both succeeding) increments the
    /// clean-probe counter and, once it reaches
    /// <see cref="FailoverSettings.StableProbes"/>, fails back to the
    /// primary. Any probe failure resets the counter to 0, so a fresh
    /// unbroken run is required. A direct call always probes; the
    /// <see cref="FailoverSettings.ProbeIntervalSeconds"/> spacing is applied
    /// only to probes issued from <see cref="PollOnce"/>, and a direct call
    /// restarts that spacing.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public void ProbeOnce()
    {
        ThrowIfDisposed();

        if (active != Active.Standby)
        {
            return;
        }

        // Stamp the attempt up front so failed probes are spaced out too.
        sinceLastProbe.Restart();

        try
        {
            // Re-subscribe with the caller's expression (empty only if
            // Subscribe was never called) so that failing back resumes the
            // same alarm subscription rather than an empty one.
            primary.Subscribe(lastSubscription ?? string.Empty);
            primary.PollOnce();
        }
        catch (Exception)
        {
            // Probe failed: the primary is still unhealthy.
            cleanProbes = 0;
            return;
        }

        cleanProbes++;
        if (cleanProbes >= settings.StableProbes)
        {
            SwitchToPrimary("recovered", 0);
        }
    }

    /// <inheritdoc />
    /// <remarks>Delegates to whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public int AcknowledgeByGuid(
        Guid alarmGuid,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName)
    {
        ThrowIfDisposed();
        return ActiveChild.AcknowledgeByGuid(
            alarmGuid, ackComment, ackOperatorName, ackOperatorNode, ackOperatorDomain, ackOperatorFullName);
    }

    /// <inheritdoc />
    /// <remarks>Delegates to whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public int AcknowledgeByName(
        string alarmName,
        string providerName,
        string groupName,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName)
    {
        ThrowIfDisposed();
        return ActiveChild.AcknowledgeByName(
            alarmName, providerName, groupName, ackComment,
            ackOperatorName, ackOperatorNode, ackOperatorDomain, ackOperatorFullName);
    }

    /// <inheritdoc />
    /// <remarks>Delegates to whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms()
    {
        ThrowIfDisposed();
        return ActiveChild.SnapshotActiveAlarms();
    }

    /// <inheritdoc />
    /// <remarks>
    /// Idempotent. Unhooks both children's events and disposes both
    /// children; the standby is disposed even if disposing the primary
    /// throws.
    /// </remarks>
    public void Dispose()
    {
        if (disposed)
        {
            return;
        }

        disposed = true;
        primary.AlarmTransitionEmitted -= OnChildTransition;
        standby.AlarmTransitionEmitted -= OnChildTransition;

        try
        {
            primary.Dispose();
        }
        finally
        {
            // Never leak the standby because the primary failed to tear down.
            standby.Dispose();
        }
    }

    /// <summary>Throws if this consumer has already been disposed.</summary>
    private void ThrowIfDisposed()
    {
        if (disposed)
        {
            throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
        }
    }

    /// <summary>
    /// Reports whether a <see cref="PollOnce"/>-driven failback probe may run
    /// now: always for the first probe of a degraded episode, afterwards only
    /// once the configured spacing has elapsed.
    /// </summary>
    private bool IsProbeDue()
    {
        return !sinceLastProbe.IsRunning
            || sinceLastProbe.Elapsed.TotalSeconds >= settings.ProbeIntervalSeconds;
    }

    /// <summary>
    /// Runs a primary action, counting consecutive failures. Any exception
    /// is treated as a primary failure: it increments the failure counter
    /// and, at <see cref="FailoverSettings.Threshold"/> while the primary is
    /// still active, switches to the standby. Only a
    /// <see cref="COMException"/> contributes its HRESULT; other exception
    /// types report 0. A success resets the counter.
    /// </summary>
    /// <param name="action">The primary call to run.</param>
    /// <param name="operation">Name of the call, used in the switch reason.</param>
    private void RunPrimary(Action action, string operation)
    {
        try
        {
            action();
        }
        catch (Exception ex)
        {
            consecutiveFailures++;
            int hresult = ex is COMException ? ex.HResult : 0;
            if (active == Active.Primary && consecutiveFailures >= settings.Threshold)
            {
                SwitchToStandby($"primary {operation} failed", hresult);
            }

            return;
        }

        consecutiveFailures = 0;
    }

    /// <summary>Makes the standby active and announces the switch.</summary>
    private void SwitchToStandby(string reason, int hresult)
    {
        active = Active.Standby;
        mode = AlarmProviderMode.Subtag;
        consecutiveFailures = 0;
        cleanProbes = 0;

        // New degraded episode: the first probe is not subject to spacing.
        sinceLastProbe.Reset();

        // Ask the standby for its snapshot ahead of the hand-off. The result
        // is intentionally discarded; the call is made for whatever priming
        // the standby performs.
        _ = standby.SnapshotActiveAlarms();
        RaiseModeChanged(AlarmProviderMode.Subtag, reason, hresult);
    }

    /// <summary>Makes the primary active again and announces the switch.</summary>
    private void SwitchToPrimary(string reason, int hresult)
    {
        active = Active.Primary;
        mode = AlarmProviderMode.Alarmmgr;
        consecutiveFailures = 0;
        cleanProbes = 0;
        sinceLastProbe.Reset();
        RaiseModeChanged(AlarmProviderMode.Alarmmgr, reason, hresult);
    }

    /// <summary>Raises <see cref="ProviderModeChanged"/> stamped with the current UTC time.</summary>
    private void RaiseModeChanged(AlarmProviderMode newMode, string reason, int hresult)
    {
        ProviderModeChanged?.Invoke(
            this,
            new AlarmProviderModeChange(newMode, reason, hresult, DateTime.UtcNow));
    }

    /// <summary>
    /// Child event handler: forwards a transition only when it came from the
    /// currently active child.
    /// </summary>
    private void OnChildTransition(object? sender, MxAlarmTransitionEvent e)
    {
        if (ReferenceEquals(sender, ActiveChild))
        {
            AlarmTransitionEmitted?.Invoke(this, e);
        }
    }
}
@@ -0,0 +1,57 @@
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
/// <summary>
/// Tunables for the auto-failover / auto-failback state machine of
/// <see cref="FailoverAlarmConsumer"/>. The constructor clamps every value to
/// a safe minimum, so a misconfigured options bind cannot yield a zero or
/// negative count or a negative interval.
/// </summary>
/// <remarks>
/// <para>
/// Written as an ordinary class with get-only properties assigned in the
/// constructor, not as a <c>record</c> or with <c>init</c> accessors,
/// because the worker multi-targets .NET Framework 4.8. That target lacks
/// <c>System.Runtime.CompilerServices.IsExternalInit</c>, so <c>init</c>
/// accessors and positional records fail to compile there (CS0518).
/// </para>
/// </remarks>
public sealed class FailoverSettings
{
    // Lower bounds applied by the constructor.
    private const int MinimumThreshold = 1;
    private const int MinimumProbeIntervalSeconds = 0;
    private const int MinimumStableProbes = 1;

    /// <summary>
    /// Number of consecutive primary COM failures that triggers the switch to
    /// standby. Always at least 1.
    /// </summary>
    public int Threshold { get; }

    /// <summary>
    /// Minimum spacing, in seconds, between failback probes. Always at least
    /// 0, where 0 means a probe on every tick.
    /// </summary>
    public int ProbeIntervalSeconds { get; }

    /// <summary>
    /// Number of consecutive clean failback probes required before switching
    /// back to the primary. Always at least 1.
    /// </summary>
    public int StableProbes { get; }

    /// <summary>
    /// Creates the settings, raising any value below its minimum up to that
    /// minimum.
    /// </summary>
    /// <param name="threshold">
    /// Consecutive primary COM failures that trigger a switch to standby.
    /// Values below 1 become 1.
    /// </param>
    /// <param name="probeIntervalSeconds">
    /// Minimum spacing (seconds) between failback probes against the
    /// recovering primary. Negative values become 0 (probe every tick).
    /// </param>
    /// <param name="stableProbes">
    /// Consecutive clean failback probes required before switching back to
    /// the primary. Values below 1 become 1.
    /// </param>
    public FailoverSettings(int threshold, int probeIntervalSeconds, int stableProbes)
    {
        Threshold = System.Math.Max(MinimumThreshold, threshold);
        ProbeIntervalSeconds = System.Math.Max(MinimumProbeIntervalSeconds, probeIntervalSeconds);
        StableProbes = System.Math.Max(MinimumStableProbes, stableProbes);
    }
}