a3752799de
B4: the field was stored in Subscribe but never read — the primary is never re-subscribed during probing. Drop it and keep the rationale as a comment.
362 lines
15 KiB
C#
362 lines
15 KiB
C#
using System;
|
|
using System.Collections.Generic;
|
|
using System.Runtime.InteropServices;
|
|
using ZB.MOM.WW.MxGateway.Contracts.Proto;
|
|
|
|
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
|
|
|
|
/// <summary>
/// Composite <see cref="IMxAccessAlarmConsumer"/> that owns a PRIMARY
/// consumer (the wnwrap <see cref="WnWrapAlarmConsumer"/> alarmmgr source)
/// and a STANDBY consumer (the <c>SubtagAlarmConsumer</c> subtag fallback),
/// and switches between them automatically:
/// <list type="bullet">
///   <item><description>
///     Auto-fails-over to standby after
///     <see cref="FailoverSettings.Threshold"/> consecutive COM
///     failures on the primary.
///   </description></item>
///   <item><description>
///     Auto-fails-back to primary after
///     <see cref="FailoverSettings.StableProbes"/> consecutive clean
///     failback probes against the recovering primary.
///   </description></item>
/// </list>
/// It re-raises <see cref="AlarmTransitionEmitted"/> from whichever child
/// is active and raises <see cref="ProviderModeChanged"/> on every switch.
/// </summary>
/// <remarks>
/// <para>
/// <strong>Active-child event forwarding.</strong> This type subscribes
/// to <em>both</em> children's <see cref="AlarmTransitionEmitted"/>
/// events up front and gates re-raising on identity: a child transition
/// is forwarded only when its <c>sender</c> is the currently active
/// child. The standby is armed (subscribed) from the start so its
/// snapshot is warm at the moment of failover, but its transitions stay
/// suppressed until it becomes active. Gating-by-active is simpler and
/// less error-prone than subscribe/unsubscribe churn on every switch,
/// and it avoids a race where a transition fires during the switch.
/// </para>
/// <para>
/// <strong>Threading.</strong> Like its children, this type is driven
/// entirely on the worker's STA: <see cref="Subscribe"/>,
/// <see cref="PollOnce"/>, <see cref="ProbeOnce"/>, and the
/// <c>AcknowledgeBy*</c> calls are all invoked from the apartment that
/// owns the underlying COM objects. It owns no locks of its own and no
/// internal timer; the worker drives <see cref="PollOnce"/> on a timer.
/// </para>
/// </remarks>
public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
{
    /// <summary>Which child currently services calls and has its transitions forwarded.</summary>
    private enum Active
    {
        Primary,
        Standby,
    }

    private readonly IMxAccessAlarmConsumer primary;
    private readonly IMxAccessAlarmConsumer standby;
    private readonly FailoverSettings settings;

    private Active active = Active.Primary;
    private AlarmProviderMode mode = AlarmProviderMode.Alarmmgr;

    // Consecutive primary failures observed by RunPrimary; reset on success and on every switch.
    private int consecutiveFailures;

    // Consecutive clean failback probes; reset on a failed probe and on every switch.
    private int cleanProbes;

    private bool disposed;

    // UTC instant of the last probe that actually executed. MinValue means
    // "never probed", so the first probe is never throttled.
    private DateTime lastProbeAtUtc = DateTime.MinValue;

    /// <summary>
    /// Composes the failover consumer over its two children and hooks both
    /// children's <see cref="IMxAccessAlarmConsumer.AlarmTransitionEmitted"/> events.
    /// </summary>
    /// <param name="primary">The PRIMARY (alarmmgr) consumer.</param>
    /// <param name="standby">The STANDBY (subtag) consumer.</param>
    /// <param name="settings">The failover/failback tunables.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public FailoverAlarmConsumer(
        IMxAccessAlarmConsumer primary,
        IMxAccessAlarmConsumer standby,
        FailoverSettings settings)
    {
        this.primary = primary ?? throw new ArgumentNullException(nameof(primary));
        this.standby = standby ?? throw new ArgumentNullException(nameof(standby));
        this.settings = settings ?? throw new ArgumentNullException(nameof(settings));

        this.primary.AlarmTransitionEmitted += OnChildTransition;
        this.standby.AlarmTransitionEmitted += OnChildTransition;
    }

    /// <inheritdoc />
    public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;

    /// <summary>
    /// Fires on every switch between primary and standby. Carries the new
    /// <see cref="AlarmProviderMode"/>, the reason, the triggering HRESULT
    /// (0 for a clean failback or a non-COM failure), and the UTC instant.
    /// </summary>
    public event EventHandler<AlarmProviderModeChange>? ProviderModeChanged;

    /// <summary>
    /// The provider mode currently active.
    /// </summary>
    public AlarmProviderMode Mode => mode;

    /// <inheritdoc />
    /// <remarks>
    /// Arms BOTH children up front so the standby snapshot is warm at the
    /// moment of failover. The standby is subscribed first, so it is armed
    /// regardless of the primary's outcome; a standby subscribe failure is
    /// surfaced (propagates to the caller) and does not count toward
    /// primary failover. The primary subscribe runs through the
    /// failure-counting wrapper so a COM failure on subscribe contributes
    /// to the failover threshold instead of escaping.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public void Subscribe(string subscription)
    {
        ThrowIfDisposed();

        // The primary is not torn down on failover and is therefore never
        // re-subscribed during ProbeOnce, so the subscription expression does
        // not need to be retained here.

        // Arm the standby first so it is warm regardless of primary outcome.
        // A standby subscribe failure is a hard fault (the fallback itself is
        // broken) and is surfaced to the caller; it does not feed the primary
        // failover counter.
        standby.Subscribe(subscription);

        // Swallowing a primary failure is deliberate: the standby is already
        // armed, so a failed primary subscribe just nudges the state machine
        // toward (or into) standby rather than aborting startup.
        RunPrimary(() => primary.Subscribe(subscription), "Subscribe");
    }

    /// <inheritdoc />
    /// <remarks>
    /// While the primary is active, drives <c>primary.PollOnce</c> through
    /// the failure-counting wrapper. While degraded (standby active),
    /// drives <c>standby.PollOnce</c> and then runs one failback probe per
    /// call via <see cref="ProbeOnce"/> — the worker drives this on a
    /// timer, so one degraded poll equals one probe tick.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public void PollOnce()
    {
        ThrowIfDisposed();

        if (active == Active.Primary)
        {
            RunPrimary(() => primary.PollOnce(), "PollOnce");
            return;
        }

        // Degraded: pump the standby for live transitions, then probe the
        // primary for recovery. Standby PollOnce is a no-op for the subtag
        // consumer but kept for symmetry / future standby sources.
        standby.PollOnce();
        ProbeOnce();
    }

    /// <summary>
    /// Runs one failback probe against the (presumed recovering) primary.
    /// Only meaningful while the standby is active; a no-op otherwise.
    /// </summary>
    /// <remarks>
    /// <para>
    /// A clean probe (primary <c>PollOnce</c> succeeds without
    /// throwing) increments the clean-probe counter and, once it reaches
    /// <see cref="FailoverSettings.StableProbes"/>, fails back to the
    /// primary. Any probe failure resets the clean-probe counter to 0 so
    /// the consumer requires a fresh unbroken run before failing back.
    /// As in the primary failure-counting path, an
    /// <see cref="OutOfMemoryException"/> is not treated as a probe
    /// failure and propagates to the caller.
    /// Exposed publicly so tests (and any external scheduler honoring
    /// <see cref="FailoverSettings.ProbeIntervalSeconds"/> cadence) can
    /// drive it directly.
    /// </para>
    /// <para>
    /// <strong>Probe throttle.</strong> When
    /// <see cref="FailoverSettings.ProbeIntervalSeconds"/> is greater than
    /// zero, successive calls to this method are throttled: a probe is
    /// skipped unless at least that many seconds have elapsed since the
    /// last probe that was actually executed. If the wall clock is found
    /// to have moved backwards since the last probe, the probe runs
    /// immediately rather than waiting for the clock to catch up. When
    /// <see cref="FailoverSettings.ProbeIntervalSeconds"/> is zero, the
    /// throttle is disabled and every call probes immediately (the default
    /// used by unit tests).
    /// </para>
    /// <para>
    /// <strong>Why PollOnce only — no re-Subscribe.</strong>
    /// Failover does NOT tear down the primary's subscription;
    /// <see cref="WnWrapAlarmConsumer"/> is single-subscribe and would
    /// throw <see cref="InvalidOperationException"/> on a second call.
    /// The probe therefore re-polls the still-subscribed primary:
    /// when the underlying COM provider recovers, <c>PollOnce</c> stops
    /// throwing and clean probes accumulate toward failback. This covers
    /// the dominant failure mode (transient COM/provider fault after a
    /// successful initial subscribe).
    /// </para>
    /// <para>
    /// <strong>Known v1 limitation.</strong> If the <em>original</em>
    /// <c>Subscribe</c> itself failed (i.e., the primary never reached a
    /// subscribed state — only reachable when
    /// <see cref="FailoverSettings.Threshold"/> is 1), polling alone
    /// cannot re-establish the subscription. That edge case is accepted
    /// for v1: the operator must restart the session to force a fresh
    /// subscribe attempt.
    /// </para>
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public void ProbeOnce()
    {
        ThrowIfDisposed();
        if (active != Active.Standby) return;

        // Read the clock once so the throttle check and the recorded probe
        // instant agree.
        DateTime nowUtc = DateTime.UtcNow;

        // Throttle probes to the configured cadence. When ProbeIntervalSeconds
        // is 0 the throttle is disabled and every call probes immediately.
        if (settings.ProbeIntervalSeconds > 0)
        {
            TimeSpan sinceLastProbe = nowUtc - lastProbeAtUtc;

            // A negative elapsed time means the wall clock was stepped
            // backwards; probe now instead of stalling failback until the
            // clock catches up with the stored instant.
            if (sinceLastProbe >= TimeSpan.Zero
                && sinceLastProbe.TotalSeconds < settings.ProbeIntervalSeconds)
            {
                return;
            }
        }

        lastProbeAtUtc = nowUtc;

        try
        {
            // Re-poll the still-subscribed primary. Do NOT call Subscribe —
            // WnWrapAlarmConsumer is single-subscribe and the primary remains
            // subscribed across the failover; calling Subscribe again would
            // always throw InvalidOperationException and prevent failback.
            primary.PollOnce();
        }
        catch (Exception ex) when (ex is not OutOfMemoryException)
        {
            // Probe failed — the primary is still unhealthy. Demand a fresh
            // unbroken run of StableProbes clean polls before failing back.
            // The filter mirrors RunPrimary so both primary call paths agree
            // on which exceptions count as a provider failure.
            cleanProbes = 0;
            return;
        }

        cleanProbes++;
        if (cleanProbes >= settings.StableProbes)
        {
            SwitchToPrimary("recovered", 0);
        }
    }

    /// <inheritdoc />
    /// <remarks>Delegates to whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public int AcknowledgeByGuid(
        Guid alarmGuid,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName)
    {
        ThrowIfDisposed();
        return ActiveChild.AcknowledgeByGuid(
            alarmGuid, ackComment, ackOperatorName, ackOperatorNode, ackOperatorDomain, ackOperatorFullName);
    }

    /// <inheritdoc />
    /// <remarks>Delegates to whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public int AcknowledgeByName(
        string alarmName,
        string providerName,
        string groupName,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName)
    {
        ThrowIfDisposed();
        return ActiveChild.AcknowledgeByName(
            alarmName, providerName, groupName, ackComment,
            ackOperatorName, ackOperatorNode, ackOperatorDomain, ackOperatorFullName);
    }

    /// <inheritdoc />
    /// <remarks>Returns the snapshot of whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer has been disposed.</exception>
    public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms()
    {
        ThrowIfDisposed();
        return ActiveChild.SnapshotActiveAlarms();
    }

    /// <summary>The child that currently services calls and whose transitions are forwarded.</summary>
    private IMxAccessAlarmConsumer ActiveChild => active == Active.Primary ? primary : standby;

    /// <summary>
    /// Throws <see cref="ObjectDisposedException"/> once <see cref="Dispose"/> has run.
    /// </summary>
    private void ThrowIfDisposed()
    {
        if (disposed) throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
    }

    /// <summary>
    /// Runs a primary COM action, counting consecutive failures. Any
    /// exception other than <see cref="OutOfMemoryException"/> is treated
    /// as a primary failure and swallowed: it increments the failure
    /// counter and, at <see cref="FailoverSettings.Threshold"/> while the
    /// primary is still active, switches to the standby. A success resets
    /// the counter.
    /// </summary>
    /// <param name="action">The primary call to execute.</param>
    /// <param name="operation">Operation name used in the failover reason text.</param>
    private void RunPrimary(Action action, string operation)
    {
        try
        {
            action();
        }
        catch (Exception ex) when (ex is not OutOfMemoryException)
        {
            consecutiveFailures++;

            // Only a COMException carries a provider HRESULT worth reporting;
            // every other failure is reported with 0.
            int hresult = ex is COMException ? ex.HResult : 0;
            if (active == Active.Primary && consecutiveFailures >= settings.Threshold)
            {
                SwitchToStandby($"primary {operation} failed", hresult);
            }

            return;
        }

        consecutiveFailures = 0;
    }

    /// <summary>
    /// Makes the standby the active child, resets both counters, warms the
    /// standby snapshot, and raises <see cref="ProviderModeChanged"/>.
    /// </summary>
    /// <param name="reason">Human-readable reason carried on the mode-change event.</param>
    /// <param name="hresult">HRESULT carried on the mode-change event.</param>
    private void SwitchToStandby(string reason, int hresult)
    {
        active = Active.Standby;
        mode = AlarmProviderMode.Subtag;
        consecutiveFailures = 0;
        cleanProbes = 0;

        try
        {
            // Warm the standby snapshot for the gateway hand-off. The gateway
            // reconciles state from this snapshot, so the return value is not
            // consumed here — the call exists for its priming side effect.
            _ = standby.SnapshotActiveAlarms();
        }
        finally
        {
            // The switch has already taken effect above, so listeners must be
            // told about it even if warming the snapshot throws; the exception
            // (if any) still propagates to the caller afterwards.
            RaiseModeChanged(AlarmProviderMode.Subtag, reason, hresult);
        }
    }

    /// <summary>
    /// Makes the primary the active child again, resets both counters, and
    /// raises <see cref="ProviderModeChanged"/>.
    /// </summary>
    /// <param name="reason">Human-readable reason carried on the mode-change event.</param>
    /// <param name="hresult">HRESULT carried on the mode-change event.</param>
    private void SwitchToPrimary(string reason, int hresult)
    {
        active = Active.Primary;
        mode = AlarmProviderMode.Alarmmgr;
        consecutiveFailures = 0;
        cleanProbes = 0;
        RaiseModeChanged(AlarmProviderMode.Alarmmgr, reason, hresult);
    }

    /// <summary>
    /// Raises <see cref="ProviderModeChanged"/> stamped with the current UTC time.
    /// </summary>
    private void RaiseModeChanged(AlarmProviderMode newMode, string reason, int hresult)
    {
        ProviderModeChanged?.Invoke(
            this,
            new AlarmProviderModeChange(newMode, reason, hresult, DateTime.UtcNow));
    }

    /// <summary>
    /// Shared handler for both children's transition events; re-raises the
    /// event with this instance as sender only when it came from the active child.
    /// </summary>
    private void OnChildTransition(object? sender, MxAlarmTransitionEvent e)
    {
        // Gate by active child: forward only the active source's transitions.
        if (ReferenceEquals(sender, ActiveChild))
        {
            AlarmTransitionEmitted?.Invoke(this, e);
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// Idempotent. Unhooks both children's transition events and disposes
    /// both children; the standby is disposed even if disposing the primary
    /// throws.
    /// </remarks>
    public void Dispose()
    {
        if (disposed) return;
        disposed = true;

        primary.AlarmTransitionEmitted -= OnChildTransition;
        standby.AlarmTransitionEmitted -= OnChildTransition;

        try
        {
            primary.Dispose();
        }
        finally
        {
            // Never leak the standby because the primary failed to tear down.
            standby.Dispose();
        }
    }
}
|