using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
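/// <summary>
/// Composite <see cref="IMxAccessAlarmConsumer"/> that owns a PRIMARY
/// consumer (the wnwrap alarmmgr source)
/// and a STANDBY consumer (the <c>SubtagAlarmConsumer</c> subtag fallback),
/// and switches between them automatically:
/// <list type="bullet">
/// <item><description>
/// Auto-fails-over to standby after
/// <see cref="FailoverSettings.Threshold"/> consecutive COM
/// failures on the primary.
/// </description></item>
/// <item><description>
/// Auto-fails-back to primary after
/// <see cref="FailoverSettings.StableProbes"/> consecutive clean
/// failback probes against the recovering primary.
/// </description></item>
/// </list>
/// It re-raises <see cref="AlarmTransitionEmitted"/> from whichever child
/// is active and raises <see cref="ProviderModeChanged"/> on every switch.
/// </summary>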
/// <remarks>
/// <para>
/// <b>Active-child event forwarding.</b> This type subscribes
/// to both children's <see cref="IMxAccessAlarmConsumer.AlarmTransitionEmitted"/>
/// events up front and gates re-raising on identity: a child transition
/// is forwarded only when its <c>sender</c> is the currently active
/// child. The standby is armed (subscribed) from the start so its
/// snapshot is warm at the moment of failover, but its transitions stay
/// suppressed until it becomes active. Gating-by-active is simpler and
/// less error-prone than subscribe/unsubscribe churn on every switch,
/// and it avoids a race where a transition fires during the switch.
/// </para>
/// <para>
/// <b>Threading.</b> Like its children, this type is driven
/// entirely on the worker's STA: <see cref="Subscribe"/>,
/// <see cref="PollOnce"/>, <see cref="ProbeOnce"/>, and the
/// <c>AcknowledgeBy*</c> calls are all invoked from the apartment that
/// owns the underlying COM objects. It owns no locks of its own and no
/// internal timer; the worker drives <see cref="PollOnce"/> on a timer.
/// </para>
/// </remarks>
public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
{
/// <summary>Identifies which of the two child consumers is currently active.</summary>
private enum Active
{
/// <summary>The primary child is active; this is the state a new instance starts in.</summary>
Primary,
/// <summary>The standby child is active; entered when the primary fails over.</summary>
Standby,
}
// Child polled while healthy and re-polled by the failback probe while degraded.
private readonly IMxAccessAlarmConsumer primary;
// Fallback child; subscribed alongside the primary so it is ready at failover.
private readonly IMxAccessAlarmConsumer standby;
// Supplies Threshold, StableProbes and ProbeIntervalSeconds.
private readonly FailoverSettings settings;
// Selects which child is queried and whose transitions are forwarded.
private Active active = Active.Primary;
// Public-facing counterpart of `active`, exposed through Mode and updated on every switch.
private AlarmProviderMode mode = AlarmProviderMode.Alarmmgr;
// Back-to-back failed primary actions; reset by a success and by any switch.
private int consecutiveFailures;
// Back-to-back clean failback probes; reset by a failed probe and by any switch.
private int cleanProbes;
// Set once by Dispose; the other public methods throw ObjectDisposedException afterwards.
private bool disposed;
// When the last probe actually ran. MinValue means "never", so the first probe is not throttled.
private DateTime lastProbeAtUtc = DateTime.MinValue;
/// <summary>
/// Builds the failover composite around its two child consumers and hooks
/// both children's transition events straight away.
/// </summary>
/// <param name="primary">The PRIMARY (alarmmgr) consumer.</param>
/// <param name="standby">The STANDBY (subtag) consumer.</param>
/// <param name="settings">The failover/failback tunables.</param>
/// <exception cref="ArgumentNullException">Any argument is <see langword="null"/>.</exception>
public FailoverAlarmConsumer(
    IMxAccessAlarmConsumer primary,
    IMxAccessAlarmConsumer standby,
    FailoverSettings settings)
{
    if (primary is null)
    {
        throw new ArgumentNullException(nameof(primary));
    }

    if (standby is null)
    {
        throw new ArgumentNullException(nameof(standby));
    }

    if (settings is null)
    {
        throw new ArgumentNullException(nameof(settings));
    }

    this.primary = primary;
    this.standby = standby;
    this.settings = settings;

    // Both children share one handler; OnChildTransition decides whether to forward.
    primary.AlarmTransitionEmitted += OnChildTransition;
    standby.AlarmTransitionEmitted += OnChildTransition;
}
/// <inheritdoc />
/// <remarks>
/// Raised only for transitions that originate from the currently active
/// child; the payload is the child's event forwarded unchanged, with this
/// instance as the sender.
/// </remarks>
public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;
/// <summary>
/// Fires on every switch between primary and standby. Carries the new
/// <see cref="AlarmProviderMode"/>, the reason, the triggering HRESULT
/// (0 for a clean failback, and also 0 when the triggering failure was not
/// a COM exception), and the UTC instant of the switch.
/// </summary>
public event EventHandler<AlarmProviderModeChange>? ProviderModeChanged;
/// <summary>
/// Gets the provider mode that is active right now.
/// </summary>
public AlarmProviderMode Mode
{
    get => mode;
}
/// <inheritdoc />
/// <remarks>
/// Subscribes BOTH children so the standby is already armed if a failover
/// happens later. The standby goes first: if its subscribe throws, the
/// exception reaches the caller and nothing is counted against the
/// primary. The primary's subscribe then runs through the
/// failure-counting wrapper, so a failure there is counted toward the
/// failover threshold instead of being thrown.
/// </remarks>
public void Subscribe(string subscription)
{
    if (disposed)
    {
        throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
    }

    // The subscription expression is not stored: failover leaves the primary
    // subscribed and ProbeOnce never re-subscribes it.

    // Standby first, so it is armed whatever happens to the primary. A failure
    // here means the fallback itself is broken, so it is allowed to propagate.
    standby.Subscribe(subscription);

    // The primary's failure is absorbed on purpose: with the standby armed, a
    // failed primary subscribe only moves the state machine toward standby.
    RunPrimary(SubscribePrimary, "Subscribe");

    void SubscribePrimary() => primary.Subscribe(subscription);
}
/// <inheritdoc />
/// <remarks>
/// With the primary active, polls the primary through the
/// failure-counting wrapper. With the standby active, polls the standby
/// and then calls <see cref="ProbeOnce"/> once, so each degraded poll
/// doubles as one failback probe tick.
/// </remarks>
public void PollOnce()
{
    if (disposed)
    {
        throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
    }

    switch (active)
    {
        case Active.Primary:
            RunPrimary(primary.PollOnce, "PollOnce");
            break;

        default:
            // Degraded: keep the standby pumping, then check on the primary.
            // The standby poll is retained for symmetry with other sources.
            standby.PollOnce();
            ProbeOnce();
            break;
    }
}
/// <summary>
/// Runs one failback probe against the (presumed recovering) primary.
/// Only meaningful while the standby is active; a no-op otherwise.
/// </summary>
/// <remarks>
/// <para>
/// A clean probe (the primary's <c>PollOnce</c> returns without
/// throwing) increments the clean-probe counter and, once it reaches
/// <see cref="FailoverSettings.StableProbes"/>, fails back to the
/// primary. A failed probe resets the counter to 0, so failback needs a
/// fresh unbroken run. <see cref="OutOfMemoryException"/> is not treated
/// as a probe failure; it propagates to the caller, matching how primary
/// failures are counted elsewhere in this type.
/// </para>
/// <para>
/// <b>Probe throttle.</b> When
/// <see cref="FailoverSettings.ProbeIntervalSeconds"/> is greater than
/// zero, a call is skipped unless at least that many seconds have passed
/// since the last probe that actually ran. If the wall clock has stepped
/// backwards since that probe, the call probes immediately rather than
/// waiting for the clock to catch up. When the interval is zero the
/// throttle is off and every call probes.
/// </para>
/// <para>
/// <b>Why <c>PollOnce</c> only, with no re-<c>Subscribe</c>.</b>
/// Failover does not tear down the primary's subscription, and
/// <c>WnWrapAlarmConsumer</c> is single-subscribe and would throw on a
/// second call. The probe therefore re-polls the still-subscribed
/// primary: once the underlying provider recovers, polls stop throwing
/// and clean probes accumulate toward failback.
/// </para>
/// <para>
/// <b>Known v1 limitation.</b> If the original <c>Subscribe</c> itself
/// failed (for example, with <see cref="FailoverSettings.Threshold"/> set
/// to 1 a failed subscribe fails over immediately), polling alone cannot
/// establish the subscription; the session must be restarted to force a
/// fresh subscribe attempt.
/// </para>
/// </remarks>
/// <exception cref="ObjectDisposedException">This instance has been disposed.</exception>
public void ProbeOnce()
{
    if (disposed)
    {
        throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
    }

    if (active != Active.Standby)
    {
        return;
    }

    // Read the clock once so the throttle decision and the recorded probe
    // time are based on the same instant.
    DateTime nowUtc = DateTime.UtcNow;

    if (settings.ProbeIntervalSeconds > 0)
    {
        TimeSpan sinceLastProbe = nowUtc - lastProbeAtUtc;

        // A negative span means the wall clock stepped backwards; probe now
        // instead of stalling failback until the clock catches up.
        if (sinceLastProbe >= TimeSpan.Zero
            && sinceLastProbe.TotalSeconds < settings.ProbeIntervalSeconds)
        {
            return;
        }
    }

    lastProbeAtUtc = nowUtc;

    try
    {
        // Re-poll the still-subscribed primary. Do NOT call Subscribe:
        // WnWrapAlarmConsumer is single-subscribe and the primary stays
        // subscribed across the failover, so a second Subscribe would always
        // throw and failback would never happen.
        primary.PollOnce();
    }
    catch (Exception ex) when (ex is not OutOfMemoryException)
    {
        // Still unhealthy: require a fresh unbroken run of clean probes.
        cleanProbes = 0;
        return;
    }

    cleanProbes++;
    if (cleanProbes >= settings.StableProbes)
    {
        SwitchToPrimary("recovered", 0);
    }
}
/// <inheritdoc />
/// <remarks>Forwards the acknowledgement to whichever child is active.</remarks>
public int AcknowledgeByGuid(
    Guid alarmGuid,
    string ackComment,
    string ackOperatorName,
    string ackOperatorNode,
    string ackOperatorDomain,
    string ackOperatorFullName)
{
    if (disposed)
    {
        throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
    }

    IMxAccessAlarmConsumer target = ActiveChild;
    return target.AcknowledgeByGuid(
        alarmGuid,
        ackComment,
        ackOperatorName,
        ackOperatorNode,
        ackOperatorDomain,
        ackOperatorFullName);
}
/// <inheritdoc />
/// <remarks>Forwards the acknowledgement to whichever child is active.</remarks>
public int AcknowledgeByName(
    string alarmName,
    string providerName,
    string groupName,
    string ackComment,
    string ackOperatorName,
    string ackOperatorNode,
    string ackOperatorDomain,
    string ackOperatorFullName)
{
    if (disposed)
    {
        throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
    }

    IMxAccessAlarmConsumer target = ActiveChild;
    return target.AcknowledgeByName(
        alarmName,
        providerName,
        groupName,
        ackComment,
        ackOperatorName,
        ackOperatorNode,
        ackOperatorDomain,
        ackOperatorFullName);
}
// NOTE(review): the return type appears here as `IReadOnlyList` with no type
// argument — confirm the element type against the declaration of
// IMxAccessAlarmConsumer.SnapshotActiveAlarms.
/// <inheritdoc />
/// <remarks>Returns the snapshot of whichever child is currently active.</remarks>
public IReadOnlyList SnapshotActiveAlarms()
{
if (disposed) throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
return ActiveChild.SnapshotActiveAlarms();
}
// The child that currently serves acknowledgements and snapshots, and whose
// transitions are forwarded.
private IMxAccessAlarmConsumer ActiveChild
{
    get
    {
        return active switch
        {
            Active.Primary => primary,
            _ => standby,
        };
    }
}
/// <summary>
/// Executes one primary action and keeps the consecutive-failure count.
/// Any exception other than <see cref="OutOfMemoryException"/> counts as
/// a failure and is absorbed; once the count reaches
/// <see cref="FailoverSettings.Threshold"/> while the primary is still
/// active, the consumer switches to the standby. A successful action
/// resets the count.
/// </summary>
/// <param name="action">The primary call to execute.</param>
/// <param name="operation">Name of the call, used in the failover reason text.</param>
private void RunPrimary(Action action, string operation)
{
    try
    {
        action();
    }
    catch (Exception failure) when (failure is not OutOfMemoryException)
    {
        consecutiveFailures++;

        if (active != Active.Primary || consecutiveFailures < settings.Threshold)
        {
            return;
        }

        // Only a COM failure carries an HRESULT worth reporting; others report 0.
        int hresult = failure is COMException comFailure ? comFailure.HResult : 0;
        SwitchToStandby($"primary {operation} failed", hresult);
        return;
    }

    consecutiveFailures = 0;
}
/// <summary>
/// Makes the standby the active child, resets both counters, primes the
/// standby snapshot and raises <see cref="ProviderModeChanged"/>.
/// </summary>
/// <param name="reason">Text describing why the failover happened.</param>
/// <param name="hresult">HRESULT of the triggering failure, or 0 if none.</param>
private void SwitchToStandby(string reason, int hresult)
{
    active = Active.Standby;
    mode = AlarmProviderMode.Subtag;
    consecutiveFailures = 0;
    cleanProbes = 0;

    try
    {
        // Warm the standby snapshot for the gateway hand-off. The result is
        // deliberately discarded; the call exists for its priming side effect.
        _ = standby.SnapshotActiveAlarms();
    }
    finally
    {
        // The switch is already committed above, so observers must hear about
        // it even if priming throws; otherwise Mode and the last notified
        // mode would disagree. A priming exception still propagates.
        RaiseModeChanged(AlarmProviderMode.Subtag, reason, hresult);
    }
}
/// <summary>
/// Makes the primary the active child again, resets both counters and
/// raises <see cref="ProviderModeChanged"/>.
/// </summary>
/// <param name="reason">Text describing why the failback happened.</param>
/// <param name="hresult">HRESULT to report with the change.</param>
private void SwitchToPrimary(string reason, int hresult)
{
    // Start the restored primary with a clean slate on both counters.
    cleanProbes = 0;
    consecutiveFailures = 0;

    mode = AlarmProviderMode.Alarmmgr;
    active = Active.Primary;

    RaiseModeChanged(AlarmProviderMode.Alarmmgr, reason, hresult);
}
/// <summary>
/// Notifies <see cref="ProviderModeChanged"/> subscribers, if any, stamping
/// the change with the current UTC time.
/// </summary>
private void RaiseModeChanged(AlarmProviderMode newMode, string reason, int hresult)
{
    var subscribers = ProviderModeChanged;
    if (subscribers is null)
    {
        // Nobody is listening; skip building the payload.
        return;
    }

    subscribers(this, new AlarmProviderModeChange(newMode, reason, hresult, DateTime.UtcNow));
}
/// <summary>
/// Shared handler for both children's transition events. Re-raises the
/// event with this instance as sender, but only when it came from the
/// active child.
/// </summary>
private void OnChildTransition(object? sender, MxAlarmTransitionEvent e)
{
    bool fromActiveChild = ReferenceEquals(sender, ActiveChild);
    if (!fromActiveChild)
    {
        // The inactive child stays subscribed but its transitions are dropped.
        return;
    }

    AlarmTransitionEmitted?.Invoke(this, e);
}
/// <inheritdoc />
/// <remarks>
/// Safe to call more than once; only the first call does any work. It
/// detaches this instance from both children's transition events and then
/// disposes both children. The standby is disposed even if disposing the
/// primary throws; that exception still propagates.
/// </remarks>
public void Dispose()
{
    if (disposed)
    {
        return;
    }

    disposed = true;

    primary.AlarmTransitionEmitted -= OnChildTransition;
    standby.AlarmTransitionEmitted -= OnChildTransition;

    try
    {
        primary.Dispose();
    }
    finally
    {
        // Must run even when the primary's Dispose fails, or the standby
        // would never be released.
        standby.Dispose();
    }
}
}