worker(alarms): FailoverAlarmConsumer auto-failover/failback state machine
This commit is contained in:
@@ -0,0 +1,61 @@
|
||||
using System;
|
||||
using ZB.MOM.WW.MxGateway.Contracts.Proto;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
|
||||
|
||||
/// <summary>
/// Event payload published by <see cref="FailoverAlarmConsumer"/> each time
/// the active alarm source flips between the primary (alarmmgr) consumer
/// and the standby (subtag) consumer. The worker maps it onto the proto
/// family <c>OnAlarmProviderModeChanged</c> so that connected gateway
/// clients can show the degraded/recovered state.
/// </summary>
/// <remarks>
/// <para>
/// Deliberately an ordinary class whose get-only properties are assigned
/// in the constructor. A <c>record</c> or <c>init</c>-only type is not an
/// option because the worker multi-targets .NET Framework 4.8, which has
/// no <c>System.Runtime.CompilerServices.IsExternalInit</c> (CS0518).
/// </para>
/// </remarks>
public sealed class AlarmProviderModeChange : EventArgs
{
    /// <summary>
    /// Builds the payload describing a single provider switch.
    /// </summary>
    /// <param name="mode">Provider mode that is active once the switch has happened.</param>
    /// <param name="reason">
    /// Human-readable explanation of the switch; <c>null</c> is stored as
    /// an empty string.
    /// </param>
    /// <param name="hResult">
    /// COM HRESULT that caused a failover, or 0 for a clean failback or
    /// when no HRESULT is associated with the switch.
    /// </param>
    /// <param name="atUtc">UTC instant at which the switch occurred.</param>
    public AlarmProviderModeChange(AlarmProviderMode mode, string reason, int hResult, DateTime atUtc)
    {
        this.Mode = mode;
        this.HResult = hResult;
        this.AtUtc = atUtc;

        // Never expose a null reason to subscribers.
        this.Reason = reason is null ? string.Empty : reason;
    }

    /// <summary>
    /// Provider mode that is active once the switch has happened.
    /// </summary>
    public AlarmProviderMode Mode { get; }

    /// <summary>
    /// Human-readable explanation of the switch (for example the failing
    /// COM call, or <c>"recovered"</c> for a failback). Never <c>null</c>.
    /// </summary>
    public string Reason { get; }

    /// <summary>
    /// COM HRESULT that caused a failover, or 0 when none applies.
    /// </summary>
    public int HResult { get; }

    /// <summary>
    /// UTC instant at which the switch occurred.
    /// </summary>
    public DateTime AtUtc { get; }
}
|
||||
@@ -0,0 +1,311 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Runtime.InteropServices;
|
||||
using ZB.MOM.WW.MxGateway.Contracts.Proto;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
|
||||
|
||||
/// <summary>
/// Composite <see cref="IMxAccessAlarmConsumer"/> that owns a PRIMARY
/// consumer (the wnwrap <see cref="WnWrapAlarmConsumer"/> alarmmgr source)
/// and a STANDBY consumer (the <c>SubtagAlarmConsumer</c> subtag fallback),
/// and switches between them automatically:
/// <list type="bullet">
///   <item><description>
///     Auto-fails-over to standby after
///     <see cref="FailoverSettings.Threshold"/> consecutive failures on
///     the primary.
///   </description></item>
///   <item><description>
///     Auto-fails-back to primary after
///     <see cref="FailoverSettings.StableProbes"/> consecutive clean
///     failback probes against the recovering primary.
///   </description></item>
/// </list>
/// It re-raises <see cref="AlarmTransitionEmitted"/> from whichever child
/// is active and raises <see cref="ProviderModeChanged"/> on every switch.
/// </summary>
/// <remarks>
/// <para>
/// <strong>Active-child event forwarding.</strong> This type subscribes
/// to <em>both</em> children's <see cref="AlarmTransitionEmitted"/>
/// events in the constructor and gates re-raising on identity: a child
/// transition is forwarded only when its <c>sender</c> is the currently
/// active child. Transitions from the inactive child are dropped.
/// Gating-by-active avoids subscribe/unsubscribe churn on every switch.
/// </para>
/// <para>
/// <strong>Threading.</strong> Like its children, this type is intended
/// to be driven entirely on the worker's STA: <see cref="Subscribe"/>,
/// <see cref="PollOnce"/>, <see cref="ProbeOnce"/>, and the
/// <c>AcknowledgeBy*</c> calls are expected to be invoked from the
/// apartment that owns the underlying COM objects. It takes no locks and
/// has no internal timer; the caller drives <see cref="PollOnce"/>.
/// </para>
/// </remarks>
public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
{
    private enum Active
    {
        Primary,
        Standby,
    }

    private readonly IMxAccessAlarmConsumer primary;
    private readonly IMxAccessAlarmConsumer standby;
    private readonly FailoverSettings settings;

    private Active active = Active.Primary;
    private AlarmProviderMode mode = AlarmProviderMode.Alarmmgr;
    private int consecutiveFailures;
    private int cleanProbes;
    private bool disposed;

    // Most recent subscription armed through Subscribe. Failback probes
    // replay it so the primary comes back with the caller's subscription
    // rather than an empty one. Null until Subscribe has been called.
    private string? lastSubscription;

    /// <summary>
    /// Composes the failover consumer over its two children and hooks both
    /// children's <see cref="IMxAccessAlarmConsumer.AlarmTransitionEmitted"/>
    /// events.
    /// </summary>
    /// <param name="primary">The PRIMARY (alarmmgr) consumer.</param>
    /// <param name="standby">The STANDBY (subtag) consumer.</param>
    /// <param name="settings">The failover/failback tunables.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public FailoverAlarmConsumer(
        IMxAccessAlarmConsumer primary,
        IMxAccessAlarmConsumer standby,
        FailoverSettings settings)
    {
        this.primary = primary ?? throw new ArgumentNullException(nameof(primary));
        this.standby = standby ?? throw new ArgumentNullException(nameof(standby));
        this.settings = settings ?? throw new ArgumentNullException(nameof(settings));

        this.primary.AlarmTransitionEmitted += OnChildTransition;
        this.standby.AlarmTransitionEmitted += OnChildTransition;
    }

    /// <inheritdoc />
    public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;

    /// <summary>
    /// Fires on every switch between primary and standby. Carries the new
    /// <see cref="AlarmProviderMode"/>, the reason, the triggering HRESULT
    /// (0 for a clean failback), and the UTC instant.
    /// </summary>
    public event EventHandler<AlarmProviderModeChange>? ProviderModeChanged;

    /// <summary>
    /// The provider mode currently active.
    /// </summary>
    public AlarmProviderMode Mode => mode;

    /// <inheritdoc />
    /// <remarks>
    /// Arms BOTH children. The standby is subscribed first, so it is armed
    /// regardless of the primary's outcome; a standby subscribe failure is
    /// surfaced (rethrown) and does not count toward primary failover. The
    /// primary subscribe runs through the failure-counting wrapper, so a
    /// failure there is swallowed and contributes to the failover
    /// threshold. The subscription is remembered so that failback probes
    /// can replay it against the primary.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public void Subscribe(string subscription)
    {
        ThrowIfDisposed();

        // Arm the standby first so it is warm regardless of primary outcome.
        // A standby subscribe failure is a hard fault (the fallback itself is
        // broken) and is surfaced to the caller; it does not feed the primary
        // failover counter.
        standby.Subscribe(subscription);

        // Remember what was actually armed so ProbeOnce can re-subscribe the
        // primary with the same expression during failback.
        lastSubscription = subscription;

        // Drive the primary subscribe through the failure-counting wrapper so
        // a failure here counts toward the failover threshold instead of
        // escaping. Swallowing the exception is deliberate: the standby is
        // already armed, so a failed primary subscribe just nudges the state
        // machine toward (or into) standby rather than aborting startup.
        RunPrimary(() => primary.Subscribe(subscription), "Subscribe");
    }

    /// <inheritdoc />
    /// <remarks>
    /// While the primary is active, drives <c>primary.PollOnce</c> through
    /// the failure-counting wrapper. While degraded (standby active),
    /// drives <c>standby.PollOnce</c> and then runs one failback probe per
    /// call via <see cref="ProbeOnce"/>, so one degraded poll equals one
    /// probe tick.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public void PollOnce()
    {
        ThrowIfDisposed();

        if (active == Active.Primary)
        {
            RunPrimary(() => primary.PollOnce(), "PollOnce");
            return;
        }

        // Degraded: pump the standby for live transitions, then probe the
        // primary for recovery.
        standby.PollOnce();
        ProbeOnce();
    }

    /// <summary>
    /// Runs one failback probe against the (presumed recovering) primary.
    /// Only meaningful while the standby is active; a no-op otherwise.
    /// </summary>
    /// <remarks>
    /// A clean probe (primary <c>Subscribe</c> + <c>PollOnce</c> both
    /// succeed) increments the clean-probe counter and, once it reaches
    /// <see cref="FailoverSettings.StableProbes"/>, fails back to the
    /// primary. Any probe failure resets the clean-probe counter to 0 so
    /// the consumer requires a fresh unbroken run before failing back.
    /// The probe re-subscribes the primary with the subscription most
    /// recently passed to <see cref="Subscribe"/> (or an empty string if
    /// <see cref="Subscribe"/> was never called). Exposed publicly so
    /// tests (and any external scheduler honoring
    /// <see cref="FailoverSettings.ProbeIntervalSeconds"/> cadence) can
    /// drive it directly; this class does not itself enforce that cadence.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public void ProbeOnce()
    {
        ThrowIfDisposed();

        if (active != Active.Standby)
        {
            return;
        }

        try
        {
            // Replay the caller's subscription: probing with an empty one
            // would leave the primary armed with the wrong expression once
            // it is promoted back to active.
            primary.Subscribe(lastSubscription ?? string.Empty);
            primary.PollOnce();
        }
        catch (Exception)
        {
            // Probe failed — the primary is still unhealthy. Demand a fresh
            // unbroken run of StableProbes clean probes before failing back.
            cleanProbes = 0;
            return;
        }

        cleanProbes++;
        if (cleanProbes >= settings.StableProbes)
        {
            SwitchToPrimary("recovered", 0);
        }
    }

    /// <inheritdoc />
    /// <remarks>Delegates to whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public int AcknowledgeByGuid(
        Guid alarmGuid,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName)
    {
        ThrowIfDisposed();

        return ActiveChild.AcknowledgeByGuid(
            alarmGuid, ackComment, ackOperatorName, ackOperatorNode, ackOperatorDomain, ackOperatorFullName);
    }

    /// <inheritdoc />
    /// <remarks>Delegates to whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public int AcknowledgeByName(
        string alarmName,
        string providerName,
        string groupName,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName)
    {
        ThrowIfDisposed();

        return ActiveChild.AcknowledgeByName(
            alarmName, providerName, groupName, ackComment,
            ackOperatorName, ackOperatorNode, ackOperatorDomain, ackOperatorFullName);
    }

    /// <inheritdoc />
    /// <remarks>Delegates to whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms()
    {
        ThrowIfDisposed();

        return ActiveChild.SnapshotActiveAlarms();
    }

    private IMxAccessAlarmConsumer ActiveChild => active == Active.Primary ? primary : standby;

    /// <summary>
    /// Throws <see cref="ObjectDisposedException"/> once
    /// <see cref="Dispose"/> has run.
    /// </summary>
    private void ThrowIfDisposed()
    {
        if (disposed)
        {
            throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
        }
    }

    /// <summary>
    /// Runs a primary action, counting consecutive failures. Any exception
    /// is swallowed and treated as a primary failure: it increments the
    /// failure counter and, at <see cref="FailoverSettings.Threshold"/>
    /// while the primary is still active, switches to the standby. A
    /// success resets the counter.
    /// </summary>
    /// <param name="action">The primary call to execute.</param>
    /// <param name="operation">Operation name used in the failover reason text.</param>
    private void RunPrimary(Action action, string operation)
    {
        try
        {
            action();
        }
        catch (Exception ex)
        {
            consecutiveFailures++;

            // Only a COMException carries a meaningful COM HRESULT; report 0
            // for every other exception type.
            int hresult = ex is COMException ? ex.HResult : 0;
            if (active == Active.Primary && consecutiveFailures >= settings.Threshold)
            {
                SwitchToStandby($"primary {operation} failed", hresult);
            }

            return;
        }

        consecutiveFailures = 0;
    }

    /// <summary>
    /// Makes the standby the active child, resets both counters, primes the
    /// standby snapshot and raises <see cref="ProviderModeChanged"/>.
    /// </summary>
    private void SwitchToStandby(string reason, int hresult)
    {
        active = Active.Standby;
        mode = AlarmProviderMode.Subtag;
        consecutiveFailures = 0;
        cleanProbes = 0;

        try
        {
            // Warm the standby snapshot for the gateway hand-off. The return
            // value is not consumed here — the call exists for its priming
            // side effect.
            _ = standby.SnapshotActiveAlarms();
        }
        finally
        {
            // The state has already flipped, so subscribers must hear about
            // it even when priming throws (the exception still propagates).
            RaiseModeChanged(AlarmProviderMode.Subtag, reason, hresult);
        }
    }

    /// <summary>
    /// Makes the primary the active child again, resets both counters and
    /// raises <see cref="ProviderModeChanged"/>.
    /// </summary>
    private void SwitchToPrimary(string reason, int hresult)
    {
        active = Active.Primary;
        mode = AlarmProviderMode.Alarmmgr;
        consecutiveFailures = 0;
        cleanProbes = 0;
        RaiseModeChanged(AlarmProviderMode.Alarmmgr, reason, hresult);
    }

    /// <summary>
    /// Raises <see cref="ProviderModeChanged"/> stamped with the current UTC time.
    /// </summary>
    private void RaiseModeChanged(AlarmProviderMode newMode, string reason, int hresult)
    {
        ProviderModeChanged?.Invoke(
            this,
            new AlarmProviderModeChange(newMode, reason, hresult, DateTime.UtcNow));
    }

    /// <summary>
    /// Shared handler for both children's transition events; forwards only
    /// those whose sender is the currently active child.
    /// </summary>
    private void OnChildTransition(object? sender, MxAlarmTransitionEvent e)
    {
        // Gate by active child: forward only the active source's transitions.
        if (ReferenceEquals(sender, ActiveChild))
        {
            AlarmTransitionEmitted?.Invoke(this, e);
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// Idempotent. Unhooks both children's events and disposes both
    /// children; the standby is disposed even if disposing the primary
    /// throws.
    /// </remarks>
    public void Dispose()
    {
        if (disposed)
        {
            return;
        }

        disposed = true;

        primary.AlarmTransitionEmitted -= OnChildTransition;
        standby.AlarmTransitionEmitted -= OnChildTransition;

        try
        {
            primary.Dispose();
        }
        finally
        {
            // Do not leak the standby when the primary's Dispose faults.
            standby.Dispose();
        }
    }
}
|
||||
@@ -0,0 +1,57 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
|
||||
|
||||
/// <summary>
/// Tunables for the auto-failover / auto-failback state machine in
/// <see cref="FailoverAlarmConsumer"/>. Every value is clamped to a safe
/// minimum in the constructor, so a misconfigured options bind cannot
/// yield a zero or negative threshold or probe count.
/// </summary>
/// <remarks>
/// <para>
/// Deliberately an ordinary class whose get-only properties are assigned
/// in the constructor. A <c>record</c> or <c>init</c>-only type is not an
/// option because the worker multi-targets .NET Framework 4.8, which has
/// no <c>System.Runtime.CompilerServices.IsExternalInit</c> and therefore
/// cannot compile <c>init</c> accessors or positional records (CS0518).
/// </para>
/// </remarks>
public sealed class FailoverSettings
{
    /// <summary>
    /// Creates the settings, raising any value that is below its safe
    /// minimum up to that minimum.
    /// </summary>
    /// <param name="threshold">
    /// Number of consecutive primary COM failures that triggers a switch
    /// to standby. Values below 1 become 1.
    /// </param>
    /// <param name="probeIntervalSeconds">
    /// Minimum spacing, in seconds, between failback probes against the
    /// recovering primary. Values below 0 become 0 (probe every tick).
    /// </param>
    /// <param name="stableProbes">
    /// Number of consecutive clean failback probes required before
    /// switching back to the primary. Values below 1 become 1.
    /// </param>
    public FailoverSettings(int threshold, int probeIntervalSeconds, int stableProbes)
    {
        Threshold = AtLeast(threshold, 1);
        ProbeIntervalSeconds = AtLeast(probeIntervalSeconds, 0);
        StableProbes = AtLeast(stableProbes, 1);
    }

    /// <summary>
    /// Number of consecutive primary COM failures that triggers a switch to standby.
    /// </summary>
    public int Threshold { get; }

    /// <summary>
    /// Minimum spacing, in seconds, between failback probes.
    /// </summary>
    public int ProbeIntervalSeconds { get; }

    /// <summary>
    /// Number of consecutive clean failback probes required before failing back.
    /// </summary>
    public int StableProbes { get; }

    /// <summary>
    /// Returns <paramref name="value"/>, or <paramref name="floor"/> when
    /// <paramref name="value"/> is smaller than it.
    /// </summary>
    private static int AtLeast(int value, int floor)
    {
        if (value < floor)
        {
            return floor;
        }

        return value;
    }
}
|
||||
Reference in New Issue
Block a user