worker(alarms): FailoverAlarmConsumer auto-failover/failback state machine
This commit is contained in:
@@ -0,0 +1,211 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using ZB.MOM.WW.MxGateway.Contracts.Proto;
|
||||
using ZB.MOM.WW.MxGateway.Worker.MxAccess;
|
||||
using Xunit;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Worker.Tests.MxAccess;
|
||||
|
||||
/// <summary>
/// Exercises the <see cref="FailoverAlarmConsumer"/> state machine against
/// in-memory fakes: failover to the standby after consecutive primary COM
/// failures, failback to the primary after consecutive clean probes,
/// forwarding of transitions from the active child only, and routing of
/// acknowledgments to the active child. Because both children are fakes,
/// no AVEVA install is needed to run these tests.
/// </summary>
public sealed class FailoverAlarmConsumerTests
{
    /// <summary>Subscription expression handed to the consumer in every test.</summary>
    private const string Subscription = @"\\HOST\Galaxy!Area";

    /// <summary>HRESULT carried by the COMException the primary fake throws.</summary>
    private const int PrimaryFailureHResult = unchecked((int)0x80004005);

    /// <summary>
    /// Primary fake. While <see cref="ThrowOnPoll"/> is set, both
    /// <see cref="Subscribe"/> and <see cref="PollOnce"/> throw a
    /// COMException, standing in for a wnwrap consumer that surfaces COM
    /// HRESULT failures. <see cref="Raise"/> lets a test push a transition
    /// through the fake so before-failover forwarding can be checked.
    /// </summary>
    private sealed class FlakyPrimary : IMxAccessAlarmConsumer
    {
        public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;

        // Public fields (not properties) so tests can flip them directly
        // and set them through an object initializer.
        public bool ThrowOnPoll = true;

        public int Polls;

        public void Subscribe(string s) => FailIfFlaky();

        public void PollOnce()
        {
            // Count the attempt before failing so failed polls are counted too.
            Polls++;
            FailIfFlaky();
        }

        public int AcknowledgeByGuid(Guid g, string c, string a, string b, string d, string e) => 11;

        public int AcknowledgeByName(string n, string p, string gr, string c, string a, string b, string d, string e) => 11;

        public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms() => Array.Empty<MxAlarmSnapshotRecord>();

        public void Dispose() { }

        public void Raise(MxAlarmTransitionEvent e) => AlarmTransitionEmitted?.Invoke(this, e);

        // Shared failure path for Subscribe and PollOnce.
        private void FailIfFlaky()
        {
            if (!ThrowOnPoll)
            {
                return;
            }

            throw new System.Runtime.InteropServices.COMException("boom", PrimaryFailureHResult);
        }
    }

    /// <summary>
    /// Standby fake (stand-in for the subtag consumer). It never throws,
    /// remembers whether it was subscribed, and can raise a transition on
    /// demand via <see cref="Raise"/>.
    /// </summary>
    private sealed class StubStandby : IMxAccessAlarmConsumer
    {
        public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;

        public bool Subscribed;

        public void Subscribe(string s)
        {
            Subscribed = true;
        }

        public void PollOnce() { }

        public int AcknowledgeByGuid(Guid g, string c, string a, string b, string d, string e) => 22;

        public int AcknowledgeByName(string n, string p, string gr, string c, string a, string b, string d, string e) => 22;

        public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms() => Array.Empty<MxAlarmSnapshotRecord>();

        public void Dispose() { }

        public void Raise(MxAlarmTransitionEvent e) => AlarmTransitionEmitted?.Invoke(this, e);
    }

    /// <summary>
    /// Builds a transition event around a record with a fresh GUID, so two
    /// calls always yield distinct instances.
    /// </summary>
    private static MxAlarmTransitionEvent SampleTransition()
    {
        return new MxAlarmTransitionEvent
        {
            Record = new MxAlarmSnapshotRecord { AlarmGuid = Guid.NewGuid() },
            PreviousState = MxAlarmStateKind.Unspecified,
        };
    }

    /// <summary>
    /// Creates the consumer under test. The probe interval is always 0 in
    /// these tests; only the two counters vary.
    /// </summary>
    private static FailoverAlarmConsumer CreateConsumer(
        FlakyPrimary primary,
        StubStandby standby,
        int threshold,
        int stableProbes)
    {
        var settings = new FailoverSettings(threshold: threshold, probeIntervalSeconds: 0, stableProbes: stableProbes);
        return new FailoverAlarmConsumer(primary, standby, settings);
    }

    [Fact]
    public void Primary_FailsThresholdTimes_SwitchesToSubtag()
    {
        var primary = new FlakyPrimary { ThrowOnPoll = true };
        var standby = new StubStandby();
        using var sut = CreateConsumer(primary, standby, threshold: 3, stableProbes: 1);

        var modeChanges = new List<AlarmProviderModeChange>();
        sut.ProviderModeChanged += (_, change) => modeChanges.Add(change);

        // Failure 1 comes from the primary's Subscribe; the standby is armed anyway.
        sut.Subscribe(Subscription);
        Assert.True(standby.Subscribed);
        Assert.Empty(modeChanges);

        // Failure 2: still below the threshold.
        sut.PollOnce();
        Assert.Empty(modeChanges);

        // Failure 3 reaches the threshold and triggers the switch.
        sut.PollOnce();

        Assert.Single(modeChanges);
        Assert.Equal(AlarmProviderMode.Subtag, modeChanges[0].Mode);
        Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);
        Assert.Equal(PrimaryFailureHResult, modeChanges[0].HResult);
    }

    [Fact]
    public void AfterSwitch_StandbyTransitionsAreForwarded()
    {
        var primary = new FlakyPrimary { ThrowOnPoll = true };
        var standby = new StubStandby();
        using var sut = CreateConsumer(primary, standby, threshold: 1, stableProbes: 1);

        MxAlarmTransitionEvent? lastForwarded = null;
        sut.AlarmTransitionEmitted += (_, transition) => lastForwarded = transition;

        // With threshold 1 the failing Subscribe alone flips the consumer to Subtag.
        sut.Subscribe(Subscription);
        Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);

        MxAlarmTransitionEvent raised = SampleTransition();
        standby.Raise(raised);

        Assert.Same(raised, lastForwarded);
    }

    [Fact]
    public void WhileDegraded_PrimaryHeals_FailsBackAfterStableProbes()
    {
        var primary = new FlakyPrimary { ThrowOnPoll = true };
        var standby = new StubStandby();
        using var sut = CreateConsumer(primary, standby, threshold: 1, stableProbes: 2);

        var modeChanges = new List<AlarmProviderModeChange>();
        sut.ProviderModeChanged += (_, change) => modeChanges.Add(change);

        // Threshold 1: the failing Subscribe produces mode change #1 (Subtag).
        sut.Subscribe(Subscription);
        Assert.Single(modeChanges);
        Assert.Equal(AlarmProviderMode.Subtag, modeChanges[^1].Mode);

        // The primary heals from here on.
        primary.ThrowOnPoll = false;

        // First clean probe: one short of StableProbes, so no failback yet.
        sut.ProbeOnce();
        Assert.Single(modeChanges);

        // Second clean probe reaches StableProbes and fails back.
        sut.ProbeOnce();

        Assert.Equal(2, modeChanges.Count);
        Assert.Equal(AlarmProviderMode.Alarmmgr, modeChanges[^1].Mode);
        Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
        Assert.Equal(0, modeChanges[^1].HResult);
    }

    [Fact]
    public void BeforeFailover_PrimaryTransitionsAreForwarded()
    {
        // A healthy primary so it stays active and can raise transitions.
        var primary = new FlakyPrimary { ThrowOnPoll = false };
        var standby = new StubStandby();
        using var sut = CreateConsumer(primary, standby, threshold: 3, stableProbes: 1);

        var forwarded = new List<MxAlarmTransitionEvent>();
        sut.AlarmTransitionEmitted += (_, transition) => forwarded.Add(transition);

        sut.Subscribe(Subscription);
        Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);

        // The primary is active, so its transition must come through.
        MxAlarmTransitionEvent fromPrimary = SampleTransition();
        primary.Raise(fromPrimary);
        Assert.Single(forwarded);
        Assert.Same(fromPrimary, forwarded[0]);

        // The standby is not active, so its transition must be suppressed.
        standby.Raise(SampleTransition());
        Assert.Single(forwarded);
    }

    [Fact]
    public void Acknowledge_DelegatesToActiveChild()
    {
        var primary = new FlakyPrimary { ThrowOnPoll = false };
        var standby = new StubStandby();
        using var sut = CreateConsumer(primary, standby, threshold: 1, stableProbes: 1);

        sut.Subscribe(Subscription);

        // Primary active: both acknowledge paths return the primary's sentinel (11).
        Assert.Equal(11, sut.AcknowledgeByGuid(Guid.NewGuid(), "c", "n", "node", "dom", "full"));
        Assert.Equal(11, sut.AcknowledgeByName("a", "p", "g", "c", "n", "node", "dom", "full"));

        // Break the primary; with threshold 1 a single failed poll fails over.
        primary.ThrowOnPoll = true;
        sut.PollOnce();
        Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);

        // Standby active: both acknowledge paths return the standby's sentinel (22).
        Assert.Equal(22, sut.AcknowledgeByGuid(Guid.NewGuid(), "c", "n", "node", "dom", "full"));
        Assert.Equal(22, sut.AcknowledgeByName("a", "p", "g", "c", "n", "node", "dom", "full"));
    }
}
|
||||
@@ -0,0 +1,61 @@
|
||||
using System;
|
||||
using ZB.MOM.WW.MxGateway.Contracts.Proto;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
|
||||
|
||||
/// <summary>
/// Event payload raised by <see cref="FailoverAlarmConsumer"/> whenever the
/// active alarm source flips between the primary (alarmmgr) consumer and
/// the standby (subtag) consumer. The worker maps it onto the proto family
/// <c>OnAlarmProviderModeChanged</c> so that connected gateway clients can
/// show the degraded / recovered state.
/// </summary>
/// <remarks>
/// <para>
/// Deliberately a plain class whose get-only properties are assigned in the
/// constructor, rather than a <c>record</c> or a type with <c>init</c>
/// accessors: the worker multi-targets .NET Framework 4.8, where
/// <c>System.Runtime.CompilerServices.IsExternalInit</c> is missing
/// (CS0518).
/// </para>
/// </remarks>
public sealed class AlarmProviderModeChange : EventArgs
{
    /// <summary>
    /// Creates the payload describing one provider switch.
    /// </summary>
    /// <param name="mode">Provider mode that is active once the switch has happened.</param>
    /// <param name="reason">
    /// Human-readable explanation of the switch. A <c>null</c> value is
    /// stored as the empty string.
    /// </param>
    /// <param name="hResult">
    /// COM HRESULT that caused a failover; 0 for a clean failback or when
    /// no HRESULT is associated with the switch.
    /// </param>
    /// <param name="atUtc">UTC instant at which the switch took place.</param>
    public AlarmProviderModeChange(AlarmProviderMode mode, string reason, int hResult, DateTime atUtc)
    {
        // Normalize a missing reason so consumers never have to null-check it.
        if (reason is null)
        {
            reason = string.Empty;
        }

        Mode = mode;
        Reason = reason;
        HResult = hResult;
        AtUtc = atUtc;
    }

    /// <summary>
    /// Provider mode that is active once the switch has happened.
    /// </summary>
    public AlarmProviderMode Mode { get; }

    /// <summary>
    /// Human-readable explanation of the switch, for example the COM call
    /// that failed, or <c>"recovered"</c> for a failback. Never <c>null</c>.
    /// </summary>
    public string Reason { get; }

    /// <summary>
    /// COM HRESULT that caused a failover, or 0 when none applies.
    /// </summary>
    public int HResult { get; }

    /// <summary>
    /// UTC instant at which the switch took place.
    /// </summary>
    public DateTime AtUtc { get; }
}
|
||||
@@ -0,0 +1,311 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Runtime.InteropServices;
|
||||
using ZB.MOM.WW.MxGateway.Contracts.Proto;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
|
||||
|
||||
/// <summary>
/// Composite <see cref="IMxAccessAlarmConsumer"/> that owns a PRIMARY
/// consumer (the wnwrap <see cref="WnWrapAlarmConsumer"/> alarmmgr source)
/// and a STANDBY consumer (the <c>SubtagAlarmConsumer</c> subtag fallback),
/// and switches between them automatically:
/// <list type="bullet">
///   <item><description>
///     Auto-fails-over to standby after
///     <see cref="FailoverSettings.Threshold"/> consecutive failures on the
///     primary.
///   </description></item>
///   <item><description>
///     Auto-fails-back to primary after
///     <see cref="FailoverSettings.StableProbes"/> consecutive clean
///     failback probes against the recovering primary.
///   </description></item>
/// </list>
/// It re-raises <see cref="AlarmTransitionEmitted"/> from whichever child
/// is active and raises <see cref="ProviderModeChanged"/> on every switch.
/// </summary>
/// <remarks>
/// <para>
/// <strong>Active-child event forwarding.</strong> This type subscribes
/// to <em>both</em> children's <see cref="AlarmTransitionEmitted"/>
/// events in its constructor and gates re-raising on identity: a child
/// transition is forwarded only when its <c>sender</c> is the currently
/// active child. The standby is subscribed from the start so it is warm at
/// the moment of failover, but its transitions stay suppressed until it
/// becomes active. Gating-by-active avoids subscribe/unsubscribe churn on
/// every switch.
/// </para>
/// <para>
/// <strong>Threading.</strong> This type takes no locks and has no internal
/// timer. It is intended to be driven from the worker's STA, i.e. the
/// apartment that owns the underlying COM objects: <see cref="Subscribe"/>,
/// <see cref="PollOnce"/>, <see cref="ProbeOnce"/> and the
/// <c>AcknowledgeBy*</c> calls must all be made from there.
/// </para>
/// </remarks>
public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
{
    /// <summary>Which child currently serves calls and events.</summary>
    private enum Active
    {
        Primary,
        Standby,
    }

    private readonly IMxAccessAlarmConsumer primary;
    private readonly IMxAccessAlarmConsumer standby;
    private readonly FailoverSettings settings;

    private Active active = Active.Primary;
    private AlarmProviderMode mode = AlarmProviderMode.Alarmmgr;

    // Consecutive primary failures seen while the primary is active.
    private int consecutiveFailures;

    // Consecutive clean failback probes seen while the standby is active.
    private int cleanProbes;

    // The expression most recently passed to Subscribe. Failback probes
    // re-subscribe the primary with it, so a recovered primary comes back on
    // the caller's real subscription rather than an empty one. Null until
    // Subscribe has been called.
    private string? lastSubscription;

    private bool disposed;

    /// <summary>
    /// Composes the failover consumer over its two children and hooks both
    /// children's transition events.
    /// </summary>
    /// <param name="primary">The PRIMARY (alarmmgr) consumer.</param>
    /// <param name="standby">The STANDBY (subtag) consumer.</param>
    /// <param name="settings">The failover/failback tunables.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public FailoverAlarmConsumer(
        IMxAccessAlarmConsumer primary,
        IMxAccessAlarmConsumer standby,
        FailoverSettings settings)
    {
        this.primary = primary ?? throw new ArgumentNullException(nameof(primary));
        this.standby = standby ?? throw new ArgumentNullException(nameof(standby));
        this.settings = settings ?? throw new ArgumentNullException(nameof(settings));

        this.primary.AlarmTransitionEmitted += OnChildTransition;
        this.standby.AlarmTransitionEmitted += OnChildTransition;
    }

    /// <inheritdoc />
    public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;

    /// <summary>
    /// Fires on every switch between primary and standby. Carries the new
    /// <see cref="AlarmProviderMode"/>, the reason, the triggering HRESULT
    /// (0 for a clean failback), and the UTC instant.
    /// </summary>
    public event EventHandler<AlarmProviderModeChange>? ProviderModeChanged;

    /// <summary>
    /// The provider mode currently active.
    /// </summary>
    public AlarmProviderMode Mode => mode;

    /// <summary>Returns whichever child is currently active.</summary>
    private IMxAccessAlarmConsumer ActiveChild => active == Active.Primary ? primary : standby;

    /// <inheritdoc />
    /// <remarks>
    /// Subscribes BOTH children. The standby goes first so it is subscribed
    /// even if the primary's <c>Subscribe</c> fails; an exception from the
    /// standby is not caught here and propagates to the caller without
    /// touching the primary failure counter. The primary subscribe runs
    /// through the failure-counting wrapper, so a failure there is swallowed
    /// and counts toward the failover threshold. The subscription is also
    /// remembered so later failback probes can re-subscribe the primary
    /// with the same expression.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public void Subscribe(string subscription)
    {
        ThrowIfDisposed();

        // Remember the expression before any child call so that a probe
        // issued after a failed subscribe still uses the caller's value.
        lastSubscription = subscription;

        // Standby first: a failure here means the fallback itself is broken,
        // so it is surfaced to the caller rather than counted.
        standby.Subscribe(subscription);

        // Swallowing a primary failure is deliberate: the standby is already
        // subscribed, so the failure only moves the state machine toward
        // (or into) standby instead of aborting startup.
        RunPrimary(() => primary.Subscribe(subscription), "Subscribe");
    }

    /// <inheritdoc />
    /// <remarks>
    /// While the primary is active, calls <c>primary.PollOnce</c> through
    /// the failure-counting wrapper. While the standby is active, calls
    /// <c>standby.PollOnce</c> and then runs exactly one failback probe via
    /// <see cref="ProbeOnce"/>, so one degraded poll equals one probe.
    /// NOTE(review): <see cref="FailoverSettings.ProbeIntervalSeconds"/> is
    /// not consulted in this type — confirm the caller's poll cadence is
    /// what is meant to honor it.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public void PollOnce()
    {
        ThrowIfDisposed();

        if (active == Active.Primary)
        {
            RunPrimary(() => primary.PollOnce(), "PollOnce");
            return;
        }

        // Degraded: pump the standby, then check whether the primary is back.
        standby.PollOnce();
        ProbeOnce();
    }

    /// <summary>
    /// Runs one failback probe against the primary. Does nothing unless the
    /// standby is currently active.
    /// </summary>
    /// <remarks>
    /// A probe re-subscribes the primary with the expression last passed to
    /// <see cref="Subscribe"/> (or the empty string if <see cref="Subscribe"/>
    /// was never called) and then polls it once. If both calls succeed the
    /// clean-probe counter is incremented and, once it reaches
    /// <see cref="FailoverSettings.StableProbes"/>, the consumer fails back
    /// to the primary. Any exception from the probe resets the counter to 0,
    /// so a fresh unbroken run is required. Public so that tests and external
    /// schedulers can drive probes directly.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public void ProbeOnce()
    {
        ThrowIfDisposed();
        if (active != Active.Standby) return;

        try
        {
            // Probe with the real subscription: on a successful failback the
            // primary must already be subscribed to what the caller asked for.
            primary.Subscribe(lastSubscription ?? string.Empty);
            primary.PollOnce();
        }
        catch (Exception)
        {
            // Still unhealthy: restart the run of clean probes.
            cleanProbes = 0;
            return;
        }

        cleanProbes++;
        if (cleanProbes >= settings.StableProbes)
        {
            SwitchToPrimary("recovered", 0);
        }
    }

    /// <inheritdoc />
    /// <remarks>Delegates to whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public int AcknowledgeByGuid(
        Guid alarmGuid,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName)
    {
        ThrowIfDisposed();
        return ActiveChild.AcknowledgeByGuid(
            alarmGuid, ackComment, ackOperatorName, ackOperatorNode, ackOperatorDomain, ackOperatorFullName);
    }

    /// <inheritdoc />
    /// <remarks>Delegates to whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public int AcknowledgeByName(
        string alarmName,
        string providerName,
        string groupName,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName)
    {
        ThrowIfDisposed();
        return ActiveChild.AcknowledgeByName(
            alarmName, providerName, groupName, ackComment,
            ackOperatorName, ackOperatorNode, ackOperatorDomain, ackOperatorFullName);
    }

    /// <inheritdoc />
    /// <remarks>Delegates to whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms()
    {
        ThrowIfDisposed();
        return ActiveChild.SnapshotActiveAlarms();
    }

    /// <summary>
    /// Unhooks both children's events and disposes both children. Safe to
    /// call more than once; only the first call has any effect.
    /// </summary>
    public void Dispose()
    {
        if (disposed) return;
        disposed = true;

        primary.AlarmTransitionEmitted -= OnChildTransition;
        standby.AlarmTransitionEmitted -= OnChildTransition;

        // Dispose the standby even when the primary's Dispose throws, so a
        // faulted primary cannot leak the standby.
        try
        {
            primary.Dispose();
        }
        finally
        {
            standby.Dispose();
        }
    }

    /// <summary>Throws if <see cref="Dispose"/> has already run.</summary>
    private void ThrowIfDisposed()
    {
        if (disposed) throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
    }

    /// <summary>
    /// Runs an action against the primary and tracks consecutive failures.
    /// Any exception is swallowed and counted as a failure; when the count
    /// reaches <see cref="FailoverSettings.Threshold"/> while the primary is
    /// still active, the consumer switches to the standby. The reported
    /// HRESULT is the exception's own for a <see cref="COMException"/> and
    /// 0 for any other exception type. A successful run resets the counter.
    /// </summary>
    /// <param name="action">The primary call to execute.</param>
    /// <param name="operation">Name of the call, used in the switch reason.</param>
    private void RunPrimary(Action action, string operation)
    {
        try
        {
            action();
        }
        catch (Exception ex)
        {
            consecutiveFailures++;
            int hresult = ex is COMException ? ex.HResult : 0;
            if (active == Active.Primary && consecutiveFailures >= settings.Threshold)
            {
                SwitchToStandby($"primary {operation} failed", hresult);
            }

            return;
        }

        consecutiveFailures = 0;
    }

    /// <summary>
    /// Makes the standby the active child, resets both counters, and
    /// announces the switch.
    /// </summary>
    private void SwitchToStandby(string reason, int hresult)
    {
        active = Active.Standby;
        mode = AlarmProviderMode.Subtag;
        consecutiveFailures = 0;
        cleanProbes = 0;

        try
        {
            // Called for its priming side effect only; the result is not
            // used here.
            _ = standby.SnapshotActiveAlarms();
        }
        finally
        {
            // The state has already flipped, so listeners must be told even
            // if priming the snapshot throws; the exception still propagates.
            RaiseModeChanged(AlarmProviderMode.Subtag, reason, hresult);
        }
    }

    /// <summary>
    /// Makes the primary the active child, resets both counters, and
    /// announces the switch.
    /// </summary>
    private void SwitchToPrimary(string reason, int hresult)
    {
        active = Active.Primary;
        mode = AlarmProviderMode.Alarmmgr;
        consecutiveFailures = 0;
        cleanProbes = 0;
        RaiseModeChanged(AlarmProviderMode.Alarmmgr, reason, hresult);
    }

    /// <summary>
    /// Raises <see cref="ProviderModeChanged"/> stamped with the current UTC time.
    /// </summary>
    private void RaiseModeChanged(AlarmProviderMode newMode, string reason, int hresult)
    {
        ProviderModeChanged?.Invoke(
            this,
            new AlarmProviderModeChange(newMode, reason, hresult, DateTime.UtcNow));
    }

    /// <summary>
    /// Handler attached to both children. Re-raises the transition with this
    /// instance as sender, but only when it came from the active child.
    /// </summary>
    private void OnChildTransition(object? sender, MxAlarmTransitionEvent e)
    {
        if (ReferenceEquals(sender, ActiveChild))
        {
            AlarmTransitionEmitted?.Invoke(this, e);
        }
    }
}
|
||||
@@ -0,0 +1,57 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
|
||||
|
||||
/// <summary>
/// Tunables for the auto-failover / auto-failback state machine in
/// <see cref="FailoverAlarmConsumer"/>. The constructor clamps every value
/// to a safe minimum, so a misconfigured options bind can never yield a
/// zero or negative count or interval.
/// </summary>
/// <remarks>
/// <para>
/// Deliberately a plain class whose get-only properties are assigned in the
/// constructor, rather than a <c>record</c> or a type with <c>init</c>
/// accessors: the worker multi-targets .NET Framework 4.8, where
/// <c>System.Runtime.CompilerServices.IsExternalInit</c> is missing, so
/// <c>init</c> accessors and positional records do not compile (CS0518).
/// </para>
/// </remarks>
public sealed class FailoverSettings
{
    /// <summary>
    /// Creates the settings, raising any value below its minimum up to that
    /// minimum.
    /// </summary>
    /// <param name="threshold">
    /// Number of consecutive primary COM failures that triggers a switch to
    /// the standby. Values below 1 become 1.
    /// </param>
    /// <param name="probeIntervalSeconds">
    /// Minimum spacing, in seconds, between failback probes against the
    /// recovering primary. Values below 0 become 0 (probe every tick).
    /// </param>
    /// <param name="stableProbes">
    /// Number of consecutive clean failback probes required before
    /// switching back to the primary. Values below 1 become 1.
    /// </param>
    public FailoverSettings(int threshold, int probeIntervalSeconds, int stableProbes)
    {
        Threshold = AtLeast(1, threshold);
        ProbeIntervalSeconds = AtLeast(0, probeIntervalSeconds);
        StableProbes = AtLeast(1, stableProbes);
    }

    /// <summary>
    /// Number of consecutive primary COM failures that triggers a switch to
    /// the standby. Always at least 1.
    /// </summary>
    public int Threshold { get; }

    /// <summary>
    /// Minimum spacing, in seconds, between failback probes. Never negative.
    /// </summary>
    public int ProbeIntervalSeconds { get; }

    /// <summary>
    /// Number of consecutive clean failback probes required before failing
    /// back. Always at least 1.
    /// </summary>
    public int StableProbes { get; }

    // Returns value, or floor when value is below it.
    private static int AtLeast(int floor, int value)
    {
        if (value < floor)
        {
            return floor;
        }

        return value;
    }
}
|
||||
Reference in New Issue
Block a user