worker(alarms): FailoverAlarmConsumer auto-failover/failback state machine
This commit is contained in:
@@ -0,0 +1,211 @@
|
|||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using ZB.MOM.WW.MxGateway.Contracts.Proto;
|
||||||
|
using ZB.MOM.WW.MxGateway.Worker.MxAccess;
|
||||||
|
using Xunit;
|
||||||
|
|
||||||
|
namespace ZB.MOM.WW.MxGateway.Worker.Tests.MxAccess;
|
||||||
|
|
||||||
|
/// <summary>
/// Behavioral tests for <see cref="FailoverAlarmConsumer"/>. They cover the
/// failover path (enough consecutive primary failures switch to the standby),
/// the failback path (enough consecutive clean probes switch back to the
/// primary), forwarding of transitions from the active child only, and
/// routing of acknowledgments to the active child. Both children are
/// in-process fakes, so no AVEVA install is needed.
/// </summary>
public sealed class FailoverAlarmConsumerTests
{
    /// <summary>Subscription expression handed to the consumer under test.</summary>
    private const string Subscription = @"\\HOST\Galaxy!Area";

    /// <summary>HRESULT carried by the fake primary's <c>COMException</c>.</summary>
    private const int FailureHResult = unchecked((int)0x80004005);

    /// <summary>
    /// Fake primary. While <see cref="ThrowOnPoll"/> is set, both
    /// <see cref="Subscribe"/> and <see cref="PollOnce"/> throw a
    /// <c>COMException</c>; once it is cleared they succeed. Acknowledgments
    /// return the sentinel 11, and <see cref="Raise"/> lets a test emit a
    /// transition as if it came from this child.
    /// </summary>
    private sealed class FlakyPrimary : IMxAccessAlarmConsumer
    {
        public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;

        public bool ThrowOnPoll = true;

        public int Polls;

        public void Subscribe(string s) => FailIfFlaky();

        public void PollOnce()
        {
            // The poll is counted before the failure check, so failed polls count too.
            Polls++;
            FailIfFlaky();
        }

        public int AcknowledgeByGuid(Guid g, string c, string a, string b, string d, string e)
        {
            return 11;
        }

        public int AcknowledgeByName(string n, string p, string gr, string c, string a, string b, string d, string e)
        {
            return 11;
        }

        public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms()
        {
            return Array.Empty<MxAlarmSnapshotRecord>();
        }

        public void Dispose()
        {
        }

        public void Raise(MxAlarmTransitionEvent e)
        {
            AlarmTransitionEmitted?.Invoke(this, e);
        }

        private void FailIfFlaky()
        {
            if (!ThrowOnPoll)
            {
                return;
            }

            throw new System.Runtime.InteropServices.COMException("boom", FailureHResult);
        }
    }

    /// <summary>
    /// Fake standby. It never throws, records whether it was subscribed,
    /// answers acknowledgments with the sentinel 22, and can emit a
    /// transition on demand through <see cref="Raise"/>.
    /// </summary>
    private sealed class StubStandby : IMxAccessAlarmConsumer
    {
        public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;

        public bool Subscribed;

        public void Subscribe(string s)
        {
            Subscribed = true;
        }

        public void PollOnce()
        {
        }

        public int AcknowledgeByGuid(Guid g, string c, string a, string b, string d, string e)
        {
            return 22;
        }

        public int AcknowledgeByName(string n, string p, string gr, string c, string a, string b, string d, string e)
        {
            return 22;
        }

        public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms()
        {
            return Array.Empty<MxAlarmSnapshotRecord>();
        }

        public void Dispose()
        {
        }

        public void Raise(MxAlarmTransitionEvent e)
        {
            AlarmTransitionEmitted?.Invoke(this, e);
        }
    }

    /// <summary>Builds a throwaway transition with a fresh alarm GUID.</summary>
    private static MxAlarmTransitionEvent SampleTransition()
    {
        var record = new MxAlarmSnapshotRecord { AlarmGuid = Guid.NewGuid() };
        return new MxAlarmTransitionEvent
        {
            Record = record,
            PreviousState = MxAlarmStateKind.Unspecified,
        };
    }

    /// <summary>Builds settings with a zero probe interval, as every test here uses.</summary>
    private static FailoverSettings Settings(int threshold, int stableProbes)
    {
        return new FailoverSettings(threshold: threshold, probeIntervalSeconds: 0, stableProbes: stableProbes);
    }

    [Fact]
    public void Primary_FailsThresholdTimes_SwitchesToSubtag()
    {
        var primary = new FlakyPrimary { ThrowOnPoll = true };
        var standby = new StubStandby();
        using var sut = new FailoverAlarmConsumer(primary, standby, Settings(threshold: 3, stableProbes: 1));

        var modeChanges = new List<AlarmProviderModeChange>();
        sut.ProviderModeChanged += (_, change) => modeChanges.Add(change);

        // Failure 1 comes from the primary subscribe; the standby is armed regardless.
        sut.Subscribe(Subscription);
        Assert.True(standby.Subscribed);
        Assert.Empty(modeChanges);

        // Failure 2: still below the threshold.
        sut.PollOnce();
        Assert.Empty(modeChanges);

        // Failure 3 reaches the threshold and triggers the switch.
        sut.PollOnce();

        AlarmProviderModeChange only = Assert.Single(modeChanges);
        Assert.Equal(AlarmProviderMode.Subtag, only.Mode);
        Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);
        Assert.Equal(FailureHResult, only.HResult);
    }

    [Fact]
    public void AfterSwitch_StandbyTransitionsAreForwarded()
    {
        var primary = new FlakyPrimary { ThrowOnPoll = true };
        var standby = new StubStandby();
        using var sut = new FailoverAlarmConsumer(primary, standby, Settings(threshold: 1, stableProbes: 1));

        MxAlarmTransitionEvent? seen = null;
        sut.AlarmTransitionEmitted += (_, transition) => seen = transition;

        // With a threshold of 1 the failed primary subscribe switches immediately.
        sut.Subscribe(Subscription);
        Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);

        MxAlarmTransitionEvent raised = SampleTransition();
        standby.Raise(raised);

        Assert.Same(raised, seen);
    }

    [Fact]
    public void WhileDegraded_PrimaryHeals_FailsBackAfterStableProbes()
    {
        var primary = new FlakyPrimary { ThrowOnPoll = true };
        var standby = new StubStandby();
        using var sut = new FailoverAlarmConsumer(primary, standby, Settings(threshold: 1, stableProbes: 2));

        var modeChanges = new List<AlarmProviderModeChange>();
        sut.ProviderModeChanged += (_, change) => modeChanges.Add(change);

        // Change 1: the failed primary subscribe fails over to the standby.
        sut.Subscribe(Subscription);
        AlarmProviderModeChange failover = Assert.Single(modeChanges);
        Assert.Equal(AlarmProviderMode.Subtag, failover.Mode);

        // The primary heals.
        primary.ThrowOnPoll = false;

        // Clean probe 1 is not enough to fail back.
        sut.ProbeOnce();
        Assert.Single(modeChanges);

        // Clean probe 2 reaches StableProbes and fails back.
        sut.ProbeOnce();

        Assert.Equal(2, modeChanges.Count);
        AlarmProviderModeChange failback = modeChanges[1];
        Assert.Equal(AlarmProviderMode.Alarmmgr, failback.Mode);
        Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
        Assert.Equal(0, failback.HResult);
    }

    [Fact]
    public void BeforeFailover_PrimaryTransitionsAreForwarded()
    {
        // A healthy primary that can still raise transitions.
        var primary = new FlakyPrimary { ThrowOnPoll = false };
        var standby = new StubStandby();
        using var sut = new FailoverAlarmConsumer(primary, standby, Settings(threshold: 3, stableProbes: 1));

        var seen = new List<MxAlarmTransitionEvent>();
        sut.AlarmTransitionEmitted += (_, transition) => seen.Add(transition);

        sut.Subscribe(Subscription);
        Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);

        // The primary is active, so its transition is forwarded.
        MxAlarmTransitionEvent fromPrimary = SampleTransition();
        primary.Raise(fromPrimary);
        MxAlarmTransitionEvent forwarded = Assert.Single(seen);
        Assert.Same(fromPrimary, forwarded);

        // The standby is not active, so its transition is suppressed.
        standby.Raise(SampleTransition());
        Assert.Single(seen);
    }

    [Fact]
    public void Acknowledge_DelegatesToActiveChild()
    {
        var primary = new FlakyPrimary { ThrowOnPoll = false };
        var standby = new StubStandby();
        using var sut = new FailoverAlarmConsumer(primary, standby, Settings(threshold: 1, stableProbes: 1));

        sut.Subscribe(Subscription);

        // Primary active: both acknowledgments return the primary's sentinel (11).
        int byGuidOnPrimary = sut.AcknowledgeByGuid(Guid.NewGuid(), "c", "n", "node", "dom", "full");
        int byNameOnPrimary = sut.AcknowledgeByName("a", "p", "g", "c", "n", "node", "dom", "full");
        Assert.Equal(11, byGuidOnPrimary);
        Assert.Equal(11, byNameOnPrimary);

        // One failed poll reaches the threshold of 1 and fails over.
        primary.ThrowOnPoll = true;
        sut.PollOnce();
        Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);

        // Standby active: both acknowledgments return the standby's sentinel (22).
        int byGuidOnStandby = sut.AcknowledgeByGuid(Guid.NewGuid(), "c", "n", "node", "dom", "full");
        int byNameOnStandby = sut.AcknowledgeByName("a", "p", "g", "c", "n", "node", "dom", "full");
        Assert.Equal(22, byGuidOnStandby);
        Assert.Equal(22, byNameOnStandby);
    }
}
|
||||||
@@ -0,0 +1,61 @@
|
|||||||
|
using System;
|
||||||
|
using ZB.MOM.WW.MxGateway.Contracts.Proto;
|
||||||
|
|
||||||
|
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
|
||||||
|
|
||||||
|
/// <summary>
/// Event payload that <see cref="FailoverAlarmConsumer"/> raises whenever the
/// active alarm source flips between the primary (alarmmgr) consumer and the
/// standby (subtag) consumer. The worker maps it onto the proto family
/// <c>OnAlarmProviderModeChanged</c> so connected gateway clients can show
/// the degraded or recovered state.
/// </summary>
/// <remarks>
/// <para>
/// Deliberately a plain class whose get-only properties are assigned in the
/// constructor, rather than a <c>record</c> or an <c>init</c>-only type: the
/// worker multi-targets .NET Framework 4.8, which has no
/// <c>System.Runtime.CompilerServices.IsExternalInit</c> (CS0518).
/// </para>
/// </remarks>
public sealed class AlarmProviderModeChange : EventArgs
{
    /// <summary>
    /// Creates the payload describing one provider switch.
    /// </summary>
    /// <param name="mode">The provider mode that is active after the switch.</param>
    /// <param name="reason">
    /// Human-readable reason for the switch; <c>null</c> is stored as an
    /// empty string.
    /// </param>
    /// <param name="hResult">
    /// The COM HRESULT that triggered a failover, or 0 for a clean failback
    /// or when no HRESULT is associated.
    /// </param>
    /// <param name="atUtc">The UTC instant at which the switch occurred.</param>
    public AlarmProviderModeChange(AlarmProviderMode mode, string reason, int hResult, DateTime atUtc)
    {
        this.Mode = mode;
        this.HResult = hResult;
        this.AtUtc = atUtc;

        // Normalize a missing reason so consumers never see null.
        this.Reason = reason is null ? string.Empty : reason;
    }

    /// <summary>
    /// Gets the provider mode that is active after the switch.
    /// </summary>
    public AlarmProviderMode Mode { get; }

    /// <summary>
    /// Gets the human-readable reason for the switch, e.g. the failing COM
    /// call, or <c>"recovered"</c> for a failback. Never <c>null</c>.
    /// </summary>
    public string Reason { get; }

    /// <summary>
    /// Gets the COM HRESULT that triggered a failover, or 0 when none applies.
    /// </summary>
    public int HResult { get; }

    /// <summary>
    /// Gets the UTC instant at which the switch occurred.
    /// </summary>
    public DateTime AtUtc { get; }
}
|
||||||
@@ -0,0 +1,311 @@
|
|||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Runtime.InteropServices;
|
||||||
|
using ZB.MOM.WW.MxGateway.Contracts.Proto;
|
||||||
|
|
||||||
|
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
|
||||||
|
|
||||||
|
/// <summary>
/// Composite <see cref="IMxAccessAlarmConsumer"/> that owns a PRIMARY
/// consumer (the wnwrap <see cref="WnWrapAlarmConsumer"/> alarmmgr source)
/// and a STANDBY consumer (the <c>SubtagAlarmConsumer</c> subtag fallback),
/// and switches between them automatically:
/// <list type="bullet">
///   <item><description>
///     Auto-fails-over to standby after
///     <see cref="FailoverSettings.Threshold"/> consecutive failures on the
///     primary.
///   </description></item>
///   <item><description>
///     Auto-fails-back to primary after
///     <see cref="FailoverSettings.StableProbes"/> consecutive clean
///     failback probes against the recovering primary.
///   </description></item>
/// </list>
/// It re-raises <see cref="AlarmTransitionEmitted"/> from whichever child
/// is active and raises <see cref="ProviderModeChanged"/> on every switch.
/// </summary>
/// <remarks>
/// <para>
/// <strong>Active-child event forwarding.</strong> This type subscribes
/// to <em>both</em> children's <see cref="AlarmTransitionEmitted"/>
/// events in the constructor and gates re-raising on identity: a child
/// transition is forwarded only when its <c>sender</c> is the currently
/// active child. The standby is armed (subscribed) from the start so its
/// snapshot is warm at the moment of failover, but its transitions stay
/// suppressed until it becomes active. Gating-by-active is simpler and
/// less error-prone than subscribe/unsubscribe churn on every switch,
/// and it avoids a race where a transition fires during the switch.
/// </para>
/// <para>
/// <strong>Threading.</strong> Like its children, this type is driven
/// entirely on the worker's STA: <see cref="Subscribe"/>,
/// <see cref="PollOnce"/>, <see cref="ProbeOnce"/>, and the
/// <c>AcknowledgeBy*</c> calls are all invoked from the apartment that
/// owns the underlying COM objects. It owns no locks of its own and no
/// internal timer; the worker drives <see cref="PollOnce"/> on a timer.
/// </para>
/// </remarks>
public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
{
    private enum Active
    {
        Primary,
        Standby,
    }

    private readonly IMxAccessAlarmConsumer primary;
    private readonly IMxAccessAlarmConsumer standby;
    private readonly FailoverSettings settings;

    private Active active = Active.Primary;
    private AlarmProviderMode mode = AlarmProviderMode.Alarmmgr;
    private int consecutiveFailures;
    private int cleanProbes;
    private bool disposed;

    // Expression most recently passed to Subscribe. Failback probes re-arm
    // the primary with this value so that a successful failback returns to a
    // primary that is subscribed to the same scope the caller asked for.
    // Stays empty until Subscribe has been called.
    private string lastSubscription = string.Empty;

    /// <summary>
    /// Composes the failover consumer over its two children and hooks both
    /// children's transition events.
    /// </summary>
    /// <param name="primary">The PRIMARY (alarmmgr) consumer.</param>
    /// <param name="standby">The STANDBY (subtag) consumer.</param>
    /// <param name="settings">The failover/failback tunables.</param>
    /// <exception cref="ArgumentNullException">
    /// Any argument is <c>null</c>.
    /// </exception>
    public FailoverAlarmConsumer(
        IMxAccessAlarmConsumer primary,
        IMxAccessAlarmConsumer standby,
        FailoverSettings settings)
    {
        this.primary = primary ?? throw new ArgumentNullException(nameof(primary));
        this.standby = standby ?? throw new ArgumentNullException(nameof(standby));
        this.settings = settings ?? throw new ArgumentNullException(nameof(settings));

        this.primary.AlarmTransitionEmitted += OnChildTransition;
        this.standby.AlarmTransitionEmitted += OnChildTransition;
    }

    /// <inheritdoc />
    public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;

    /// <summary>
    /// Fires on every switch between primary and standby. Carries the new
    /// <see cref="AlarmProviderMode"/>, the reason, the triggering HRESULT
    /// (0 for a clean failback), and the UTC instant.
    /// </summary>
    public event EventHandler<AlarmProviderModeChange>? ProviderModeChanged;

    /// <summary>
    /// The provider mode currently active.
    /// </summary>
    public AlarmProviderMode Mode => mode;

    private IMxAccessAlarmConsumer ActiveChild => active == Active.Primary ? primary : standby;

    /// <inheritdoc />
    /// <remarks>
    /// Arms BOTH children so the standby snapshot is warm at the moment of
    /// failover. The standby is subscribed first; a standby subscribe
    /// failure is surfaced to the caller and does not count toward primary
    /// failover. The primary subscribe runs through the failure-counting
    /// wrapper, so a failure there contributes to the failover threshold
    /// instead of escaping. The expression is remembered so that later
    /// failback probes re-arm the primary with the same subscription.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public void Subscribe(string subscription)
    {
        ThrowIfDisposed();

        // Arm the standby first so it is warm regardless of primary outcome.
        // A standby subscribe failure is a hard fault (the fallback itself is
        // broken) and is surfaced to the caller; it does not feed the primary
        // failover counter.
        standby.Subscribe(subscription);

        // Remember the expression before touching the primary: the primary
        // subscribe below may fail over immediately, and the probes that
        // follow need the real expression rather than an empty one.
        lastSubscription = subscription;

        // Swallowing a primary failure here is deliberate: the standby is
        // already armed, so a failed primary subscribe just nudges the state
        // machine toward (or into) standby rather than aborting startup.
        RunPrimary(() => primary.Subscribe(subscription), "Subscribe");
    }

    /// <inheritdoc />
    /// <remarks>
    /// While the primary is active, drives <c>primary.PollOnce</c> through
    /// the failure-counting wrapper. While degraded (standby active),
    /// drives <c>standby.PollOnce</c> and then runs exactly one failback
    /// probe via <see cref="ProbeOnce"/>. This class does not itself read
    /// <see cref="FailoverSettings.ProbeIntervalSeconds"/>; probe cadence is
    /// whatever cadence the caller polls at.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public void PollOnce()
    {
        ThrowIfDisposed();

        if (active == Active.Primary)
        {
            RunPrimary(() => primary.PollOnce(), "PollOnce");
            return;
        }

        // Degraded: pump the standby for live transitions, then probe the
        // primary for recovery. Standby PollOnce is a no-op for the subtag
        // consumer but kept for symmetry / future standby sources.
        standby.PollOnce();
        ProbeOnce();
    }

    /// <summary>
    /// Runs one failback probe against the (presumed recovering) primary.
    /// Only meaningful while the standby is active; a no-op otherwise.
    /// </summary>
    /// <remarks>
    /// A probe re-subscribes the primary with the expression most recently
    /// passed to <see cref="Subscribe"/> and then polls it once. If both
    /// calls succeed the clean-probe counter is incremented and, once it
    /// reaches <see cref="FailoverSettings.StableProbes"/>, the consumer
    /// fails back to the primary. Any exception from the probe resets the
    /// counter to 0, so a fresh unbroken run is required before failing
    /// back. Exposed publicly so tests (and any external scheduler honoring
    /// <see cref="FailoverSettings.ProbeIntervalSeconds"/> cadence) can
    /// drive it directly.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public void ProbeOnce()
    {
        ThrowIfDisposed();

        if (active != Active.Standby)
        {
            return;
        }

        try
        {
            // Probe with the caller's real subscription, not an empty one, so
            // a clean probe proves the primary works for the scope it will
            // serve after failback.
            primary.Subscribe(lastSubscription);
            primary.PollOnce();
        }
        catch (Exception)
        {
            // Probe failed: the primary is still unhealthy. This is the
            // expected outcome while degraded, so it is absorbed here and
            // only restarts the clean-probe run.
            cleanProbes = 0;
            return;
        }

        cleanProbes++;
        if (cleanProbes >= settings.StableProbes)
        {
            SwitchToPrimary("recovered", 0);
        }
    }

    /// <inheritdoc />
    /// <remarks>Delegates to whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public int AcknowledgeByGuid(
        Guid alarmGuid,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName)
    {
        ThrowIfDisposed();
        return ActiveChild.AcknowledgeByGuid(
            alarmGuid, ackComment, ackOperatorName, ackOperatorNode, ackOperatorDomain, ackOperatorFullName);
    }

    /// <inheritdoc />
    /// <remarks>Delegates to whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public int AcknowledgeByName(
        string alarmName,
        string providerName,
        string groupName,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName)
    {
        ThrowIfDisposed();
        return ActiveChild.AcknowledgeByName(
            alarmName, providerName, groupName, ackComment,
            ackOperatorName, ackOperatorNode, ackOperatorDomain, ackOperatorFullName);
    }

    /// <inheritdoc />
    /// <remarks>Returns the snapshot of whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms()
    {
        ThrowIfDisposed();
        return ActiveChild.SnapshotActiveAlarms();
    }

    /// <inheritdoc />
    /// <remarks>
    /// Idempotent. Unhooks both children's events and disposes both
    /// children; the standby is disposed even if disposing the primary
    /// throws.
    /// </remarks>
    public void Dispose()
    {
        if (disposed)
        {
            return;
        }

        disposed = true;

        primary.AlarmTransitionEmitted -= OnChildTransition;
        standby.AlarmTransitionEmitted -= OnChildTransition;

        try
        {
            primary.Dispose();
        }
        finally
        {
            // Release the standby even when the primary's Dispose faults,
            // so one broken child cannot leak the other.
            standby.Dispose();
        }
    }

    /// <summary>
    /// Throws <see cref="ObjectDisposedException"/> once <see cref="Dispose"/>
    /// has run.
    /// </summary>
    private void ThrowIfDisposed()
    {
        if (disposed)
        {
            throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
        }
    }

    /// <summary>
    /// Runs a primary action, counting consecutive failures. Any exception
    /// is absorbed and increments the failure counter; when the counter
    /// reaches <see cref="FailoverSettings.Threshold"/> while the primary is
    /// still active, the consumer switches to the standby. A success resets
    /// the counter.
    /// </summary>
    /// <param name="action">The primary call to execute.</param>
    /// <param name="operation">
    /// Name of the call, embedded in the switch reason on failover.
    /// </param>
    private void RunPrimary(Action action, string operation)
    {
        try
        {
            action();
        }
        catch (Exception ex)
        {
            consecutiveFailures++;

            // Only a COMException carries a meaningful COM HRESULT; other
            // exception types are reported with 0.
            int hresult = ex is COMException ? ex.HResult : 0;
            if (active == Active.Primary && consecutiveFailures >= settings.Threshold)
            {
                SwitchToStandby($"primary {operation} failed", hresult);
            }

            return;
        }

        consecutiveFailures = 0;
    }

    /// <summary>
    /// Makes the standby the active child, resets both counters, warms the
    /// standby snapshot, and announces the switch.
    /// </summary>
    private void SwitchToStandby(string reason, int hresult)
    {
        active = Active.Standby;
        mode = AlarmProviderMode.Subtag;
        consecutiveFailures = 0;
        cleanProbes = 0;

        try
        {
            // Warm the standby snapshot for the gateway hand-off. The gateway
            // reconciles state from this snapshot, so the return value is not
            // consumed here; the call exists for its priming side effect.
            _ = standby.SnapshotActiveAlarms();
        }
        finally
        {
            // The state above has already flipped, so listeners must hear
            // about it even if the warm-up call throws. The exception, if
            // any, still propagates to the caller afterwards.
            RaiseModeChanged(AlarmProviderMode.Subtag, reason, hresult);
        }
    }

    /// <summary>
    /// Makes the primary the active child again, resets both counters, and
    /// announces the switch.
    /// </summary>
    private void SwitchToPrimary(string reason, int hresult)
    {
        active = Active.Primary;
        mode = AlarmProviderMode.Alarmmgr;
        consecutiveFailures = 0;
        cleanProbes = 0;
        RaiseModeChanged(AlarmProviderMode.Alarmmgr, reason, hresult);
    }

    /// <summary>
    /// Raises <see cref="ProviderModeChanged"/> stamped with the current UTC time.
    /// </summary>
    private void RaiseModeChanged(AlarmProviderMode newMode, string reason, int hresult)
    {
        ProviderModeChanged?.Invoke(
            this,
            new AlarmProviderModeChange(newMode, reason, hresult, DateTime.UtcNow));
    }

    /// <summary>
    /// Shared handler for both children's transition events. Re-raises the
    /// transition with this instance as sender only when it came from the
    /// active child.
    /// </summary>
    private void OnChildTransition(object? sender, MxAlarmTransitionEvent e)
    {
        // Gate by active child: forward only the active source's transitions.
        if (ReferenceEquals(sender, ActiveChild))
        {
            AlarmTransitionEmitted?.Invoke(this, e);
        }
    }
}
|
||||||
@@ -0,0 +1,57 @@
|
|||||||
|
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
|
||||||
|
|
||||||
|
/// <summary>
/// Tunables for the auto-failover / auto-failback state machine in
/// <see cref="FailoverAlarmConsumer"/>. Every value is clamped to a safe
/// minimum in the constructor, so a misconfigured options bind cannot
/// produce a zero or negative threshold or probe count.
/// </summary>
/// <remarks>
/// <para>
/// Deliberately a plain class whose get-only properties are assigned in the
/// constructor, rather than a <c>record</c> or an <c>init</c>-only type: the
/// worker multi-targets .NET Framework 4.8, which has no
/// <c>System.Runtime.CompilerServices.IsExternalInit</c> and therefore
/// cannot compile <c>init</c> accessors or positional records (CS0518).
/// </para>
/// </remarks>
public sealed class FailoverSettings
{
    /// <summary>
    /// Creates the settings, raising each value to its minimum where needed.
    /// </summary>
    /// <param name="threshold">
    /// Consecutive primary COM failures that trigger a switch to standby.
    /// Values below 1 become 1.
    /// </param>
    /// <param name="probeIntervalSeconds">
    /// Minimum spacing, in seconds, between failback probes against the
    /// recovering primary. Values below 0 become 0 (probe every tick).
    /// </param>
    /// <param name="stableProbes">
    /// Consecutive clean failback probes required before switching back to
    /// the primary. Values below 1 become 1.
    /// </param>
    public FailoverSettings(int threshold, int probeIntervalSeconds, int stableProbes)
    {
        Threshold = AtLeast(threshold, 1);
        ProbeIntervalSeconds = AtLeast(probeIntervalSeconds, 0);
        StableProbes = AtLeast(stableProbes, 1);
    }

    /// <summary>
    /// Gets the number of consecutive primary COM failures that trigger a
    /// switch to standby. Always at least 1.
    /// </summary>
    public int Threshold { get; }

    /// <summary>
    /// Gets the minimum spacing, in seconds, between failback probes.
    /// Never negative.
    /// </summary>
    public int ProbeIntervalSeconds { get; }

    /// <summary>
    /// Gets the number of consecutive clean failback probes required before
    /// failing back. Always at least 1.
    /// </summary>
    public int StableProbes { get; }

    /// <summary>
    /// Returns <paramref name="value"/>, or <paramref name="floor"/> when
    /// the value is below it.
    /// </summary>
    private static int AtLeast(int value, int floor)
    {
        if (value < floor)
        {
            return floor;
        }

        return value;
    }
}
|
||||||
Reference in New Issue
Block a user