using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using ZB.MOM.WW.MxGateway.Contracts.Proto;

namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;

/// <summary>
/// Composite <see cref="IMxAccessAlarmConsumer"/> that owns a PRIMARY
/// consumer (the wnwrap alarmmgr source) and a STANDBY consumer (the
/// <c>SubtagAlarmConsumer</c> subtag fallback), and switches between them
/// automatically:
/// <list type="bullet">
///   <item>
///     Auto-fails-over to standby after
///     <see cref="FailoverSettings.Threshold"/> consecutive COM failures on
///     the primary.
///   </item>
///   <item>
///     Auto-fails-back to primary after
///     <see cref="FailoverSettings.StableProbes"/> consecutive clean
///     failback probes against the recovering primary.
///   </item>
///   <item>
///     It re-raises <see cref="AlarmTransitionEmitted"/> from whichever
///     child is active and raises <see cref="ProviderModeChanged"/> on
///     every switch.
///   </item>
/// </list>
/// </summary>
/// <remarks>
/// <para>
/// <b>Active-child event forwarding.</b> This type subscribes to both
/// children's <see cref="IMxAccessAlarmConsumer.AlarmTransitionEmitted"/>
/// events up front and gates re-raising on identity: a child transition is
/// forwarded only when its sender is the currently active child. The
/// standby is armed (subscribed) from the start so its snapshot is warm at
/// the moment of failover, but its transitions stay suppressed until it
/// becomes active. Gating-by-active is simpler and less error-prone than
/// subscribe/unsubscribe churn on every switch, and it avoids a race where
/// a transition fires during the switch.
/// </para>
/// <para>
/// <b>Threading.</b> Like its children, this type is driven entirely on
/// the worker's STA: <see cref="Subscribe"/>, <see cref="PollOnce"/>,
/// <see cref="ProbeOnce"/>, and the <c>AcknowledgeBy*</c> calls are all
/// invoked from the apartment that owns the underlying COM objects. It
/// owns no locks of its own and no internal timer; the worker drives
/// <see cref="PollOnce"/> on a timer.
/// </para>
/// </remarks>
public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
{
    /// <summary>Which child currently serves calls and event forwarding.</summary>
    private enum Active
    {
        Primary,
        Standby,
    }

    private readonly IMxAccessAlarmConsumer primary;
    private readonly IMxAccessAlarmConsumer standby;
    private readonly FailoverSettings settings;

    private Active active = Active.Primary;
    private AlarmProviderMode mode = AlarmProviderMode.Alarmmgr;

    // Consecutive primary failures observed by RunPrimary since the last success/switch.
    private int consecutiveFailures;

    // Consecutive clean failback probes observed by ProbeOnce since the last failure/switch.
    private int cleanProbes;

    private bool disposed;

    // Wall-clock instant of the last probe that actually executed (throttle reference).
    private DateTime lastProbeAtUtc = DateTime.MinValue;

    /// <summary>
    /// Composes the failover consumer over its two children and subscribes
    /// to both children's transition events.
    /// </summary>
    /// <param name="primary">The PRIMARY (alarmmgr) consumer.</param>
    /// <param name="standby">The STANDBY (subtag) consumer.</param>
    /// <param name="settings">The failover/failback tunables.</param>
    /// <exception cref="ArgumentNullException">
    /// Any argument is <see langword="null"/>.
    /// </exception>
    public FailoverAlarmConsumer(
        IMxAccessAlarmConsumer primary,
        IMxAccessAlarmConsumer standby,
        FailoverSettings settings)
    {
        this.primary = primary ?? throw new ArgumentNullException(nameof(primary));
        this.standby = standby ?? throw new ArgumentNullException(nameof(standby));
        this.settings = settings ?? throw new ArgumentNullException(nameof(settings));

        // Both children are hooked for the whole lifetime; OnChildTransition
        // decides per event whether the sender is the active child.
        this.primary.AlarmTransitionEmitted += OnChildTransition;
        this.standby.AlarmTransitionEmitted += OnChildTransition;
    }

    /// <inheritdoc />
    public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;

    /// <summary>
    /// Fires on every switch between primary and standby. Carries the new
    /// <see cref="AlarmProviderMode"/>, the reason, the triggering HRESULT
    /// (0 for a clean failback, or when the triggering exception was not a
    /// <see cref="COMException"/>), and the UTC instant.
    /// </summary>
    public event EventHandler<AlarmProviderModeChange>? ProviderModeChanged;

    /// <summary>
    /// The provider mode currently active.
    /// </summary>
    public AlarmProviderMode Mode => mode;

    /// <inheritdoc />
    /// <remarks>
    /// Arms BOTH children up front so the standby snapshot is warm at the
    /// moment of failover. The standby is subscribed first, so it is armed
    /// even if the primary's <c>Subscribe</c> throws; a standby subscribe
    /// failure is surfaced (rethrown) but does not count toward primary
    /// failover. The primary subscribe runs through the failure-counting
    /// wrapper so a COM failure on subscribe contributes to the failover
    /// threshold.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public void Subscribe(string subscription)
    {
        ThrowIfDisposed();

        // The primary is not torn down on failover and is therefore never
        // re-subscribed during ProbeOnce, so the subscription expression
        // does not need to be retained here.

        // Arm the standby first so it is warm regardless of primary outcome.
        // A standby subscribe failure is a hard fault (the fallback itself
        // is broken) and is surfaced to the caller; it does not feed the
        // primary failover counter.
        standby.Subscribe(subscription);

        // Drive the primary subscribe through the failure-counting wrapper
        // so a COM failure here counts toward the failover threshold instead
        // of escaping. Swallowing the exception is deliberate: the standby
        // is already armed, so a failed primary subscribe just nudges the
        // state machine toward (or into) standby rather than aborting
        // startup.
        RunPrimary(() => primary.Subscribe(subscription), "Subscribe");
    }

    /// <inheritdoc />
    /// <remarks>
    /// While the primary is active, drives <c>primary.PollOnce</c> through
    /// the failure-counting wrapper. While degraded (standby active),
    /// drives <c>standby.PollOnce</c> and then runs one failback probe per
    /// call via <see cref="ProbeOnce"/> — the worker drives this on a
    /// timer, so one degraded poll equals one probe tick.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public void PollOnce()
    {
        ThrowIfDisposed();

        if (active == Active.Primary)
        {
            RunPrimary(() => primary.PollOnce(), "PollOnce");
            return;
        }

        // Degraded: pump the standby for live transitions, then probe the
        // primary for recovery. Standby PollOnce is a no-op for the subtag
        // consumer but kept for symmetry / future standby sources.
        standby.PollOnce();
        ProbeOnce();
    }

    /// <summary>
    /// Runs one failback probe against the (presumed recovering) primary.
    /// Only meaningful while the standby is active; a no-op otherwise.
    /// </summary>
    /// <remarks>
    /// <para>
    /// A clean probe (primary <c>PollOnce</c> succeeds without throwing)
    /// increments the clean-probe counter and, once it reaches
    /// <see cref="FailoverSettings.StableProbes"/>, fails back to the
    /// primary. Any probe failure resets the clean-probe counter to 0 so
    /// the consumer requires a fresh unbroken run before failing back.
    /// Exposed publicly so tests (and any external scheduler honoring
    /// <see cref="FailoverSettings.ProbeIntervalSeconds"/> cadence) can
    /// drive it directly.
    /// </para>
    /// <para>
    /// <b>Probe throttle.</b> When
    /// <see cref="FailoverSettings.ProbeIntervalSeconds"/> is greater than
    /// zero, successive calls to this method are throttled: a probe is
    /// skipped unless at least that many seconds have elapsed since the
    /// last probe that was actually executed. If the wall clock has moved
    /// backwards since that probe (negative elapsed time), the probe is
    /// treated as due so a clock correction cannot stall failback. When
    /// <see cref="FailoverSettings.ProbeIntervalSeconds"/> is zero, the
    /// throttle is disabled and every call probes immediately (the default
    /// used by unit tests).
    /// </para>
    /// <para>
    /// <b>Why <c>PollOnce</c> only — no re-<c>Subscribe</c>.</b> Failover
    /// does NOT tear down the primary's subscription;
    /// <c>WnWrapAlarmConsumer</c> is single-subscribe and would throw
    /// <see cref="InvalidOperationException"/> on a second call. The probe
    /// therefore re-polls the still-subscribed primary: when the
    /// underlying COM provider recovers, <c>PollOnce</c> stops throwing and
    /// clean probes accumulate toward failback. This covers the dominant
    /// failure mode (transient COM/provider fault after a successful
    /// initial subscribe).
    /// </para>
    /// <para>
    /// <b>Known v1 limitation.</b> If the original <c>Subscribe</c> itself
    /// failed (i.e., the primary never reached a subscribed state — only
    /// reachable when <see cref="FailoverSettings.Threshold"/> is 1),
    /// polling alone cannot re-establish the subscription. That edge case
    /// is accepted for v1: the operator must restart the session to force
    /// a fresh subscribe attempt.
    /// </para>
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public void ProbeOnce()
    {
        ThrowIfDisposed();

        if (active != Active.Standby)
        {
            return;
        }

        // Throttle probes to the configured cadence. When
        // ProbeIntervalSeconds is 0 the throttle is disabled and every call
        // probes immediately.
        DateTime nowUtc = DateTime.UtcNow;
        if (settings.ProbeIntervalSeconds > 0)
        {
            TimeSpan sinceLastProbe = nowUtc - lastProbeAtUtc;

            // A negative elapsed time means the wall clock was stepped
            // backwards after the last probe. Treat that as "due" rather
            // than skipping probes until the clock catches up again.
            bool clockWentBackwards = sinceLastProbe < TimeSpan.Zero;
            if (!clockWentBackwards
                && sinceLastProbe.TotalSeconds < settings.ProbeIntervalSeconds)
            {
                return;
            }
        }

        lastProbeAtUtc = nowUtc;

        try
        {
            // Re-poll the still-subscribed primary. Do NOT call Subscribe —
            // WnWrapAlarmConsumer is single-subscribe and the primary
            // remains subscribed across the failover; calling Subscribe
            // again would always throw InvalidOperationException and
            // prevent failback.
            primary.PollOnce();
        }
        catch (Exception ex) when (ex is not OutOfMemoryException)
        {
            // Probe failed — the primary is still unhealthy. Demand a fresh
            // unbroken run of StableProbes clean polls before failing back.
            // OutOfMemoryException is excluded, matching RunPrimary: it is
            // not evidence about the primary's health and must not be
            // swallowed.
            cleanProbes = 0;
            return;
        }

        cleanProbes++;
        if (cleanProbes >= settings.StableProbes)
        {
            SwitchToPrimary("recovered", 0);
        }
    }

    /// <inheritdoc />
    /// <remarks>Delegates to whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public int AcknowledgeByGuid(
        Guid alarmGuid,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName)
    {
        ThrowIfDisposed();

        return ActiveChild.AcknowledgeByGuid(
            alarmGuid,
            ackComment,
            ackOperatorName,
            ackOperatorNode,
            ackOperatorDomain,
            ackOperatorFullName);
    }

    /// <inheritdoc />
    /// <remarks>Delegates to whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    public int AcknowledgeByName(
        string alarmName,
        string providerName,
        string groupName,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName)
    {
        ThrowIfDisposed();

        return ActiveChild.AcknowledgeByName(
            alarmName,
            providerName,
            groupName,
            ackComment,
            ackOperatorName,
            ackOperatorNode,
            ackOperatorDomain,
            ackOperatorFullName);
    }

    /// <inheritdoc />
    /// <remarks>Returns the snapshot of whichever child is currently active.</remarks>
    /// <exception cref="ObjectDisposedException">The consumer was disposed.</exception>
    // NOTE(review): the element type argument of IReadOnlyList is not visible
    // in the reviewed source; the return type must match the declaration of
    // IMxAccessAlarmConsumer.SnapshotActiveAlarms — confirm before merging.
    public IReadOnlyList SnapshotActiveAlarms()
    {
        ThrowIfDisposed();

        return ActiveChild.SnapshotActiveAlarms();
    }

    /// <summary>The child that currently serves calls and event forwarding.</summary>
    private IMxAccessAlarmConsumer ActiveChild =>
        active == Active.Primary ? primary : standby;

    /// <summary>
    /// Throws <see cref="ObjectDisposedException"/> once
    /// <see cref="Dispose"/> has run.
    /// </summary>
    private void ThrowIfDisposed()
    {
        if (disposed)
        {
            throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
        }
    }

    /// <summary>
    /// Runs a primary COM action, counting consecutive failures. A
    /// <see cref="COMException"/> (or any exception other than
    /// <see cref="OutOfMemoryException"/>, treated as a COM failure)
    /// increments the failure counter and, at
    /// <see cref="FailoverSettings.Threshold"/> while the primary is still
    /// active, switches to the standby. A success resets the counter.
    /// </summary>
    /// <param name="action">The primary call to execute.</param>
    /// <param name="operation">
    /// Operation name used in the failover reason text.
    /// </param>
    private void RunPrimary(Action action, string operation)
    {
        try
        {
            action();
        }
        catch (Exception ex) when (ex is not OutOfMemoryException)
        {
            consecutiveFailures++;

            // Only COM failures carry a meaningful HRESULT for the
            // mode-change event; everything else is reported as 0.
            int hresult = ex is COMException ? ex.HResult : 0;

            if (active == Active.Primary && consecutiveFailures >= settings.Threshold)
            {
                SwitchToStandby($"primary {operation} failed", hresult);
            }

            return;
        }

        consecutiveFailures = 0;
    }

    /// <summary>
    /// Makes the standby the active child, resets both counters, warms the
    /// standby snapshot and raises <see cref="ProviderModeChanged"/>.
    /// </summary>
    private void SwitchToStandby(string reason, int hresult)
    {
        active = Active.Standby;
        mode = AlarmProviderMode.Subtag;
        consecutiveFailures = 0;
        cleanProbes = 0;

        try
        {
            // Warm the standby snapshot for the gateway hand-off. The
            // gateway reconciles state from this snapshot, so the return
            // value is not consumed here — the call exists for its priming
            // side effect.
            _ = standby.SnapshotActiveAlarms();
        }
        finally
        {
            // The switch has already happened above, so observers must be
            // told even if the warm-up throws; the warm-up exception still
            // propagates to the caller after the event is raised.
            RaiseModeChanged(AlarmProviderMode.Subtag, reason, hresult);
        }
    }

    /// <summary>
    /// Makes the primary the active child again, resets both counters and
    /// raises <see cref="ProviderModeChanged"/>.
    /// </summary>
    private void SwitchToPrimary(string reason, int hresult)
    {
        active = Active.Primary;
        mode = AlarmProviderMode.Alarmmgr;
        consecutiveFailures = 0;
        cleanProbes = 0;

        RaiseModeChanged(AlarmProviderMode.Alarmmgr, reason, hresult);
    }

    /// <summary>
    /// Raises <see cref="ProviderModeChanged"/> stamped with the current
    /// UTC time.
    /// </summary>
    private void RaiseModeChanged(AlarmProviderMode newMode, string reason, int hresult)
    {
        ProviderModeChanged?.Invoke(
            this,
            new AlarmProviderModeChange(newMode, reason, hresult, DateTime.UtcNow));
    }

    /// <summary>
    /// Child event handler: forwards a transition only when it was raised
    /// by the currently active child, re-raising it with this instance as
    /// the sender.
    /// </summary>
    private void OnChildTransition(object? sender, MxAlarmTransitionEvent e)
    {
        // Gate by active child: forward only the active source's transitions.
        if (ReferenceEquals(sender, ActiveChild))
        {
            AlarmTransitionEmitted?.Invoke(this, e);
        }
    }

    /// <summary>
    /// Unhooks both children's transition events and disposes both
    /// children. Safe to call more than once; later calls are no-ops.
    /// </summary>
    public void Dispose()
    {
        if (disposed)
        {
            return;
        }

        disposed = true;

        primary.AlarmTransitionEmitted -= OnChildTransition;
        standby.AlarmTransitionEmitted -= OnChildTransition;

        try
        {
            primary.Dispose();
        }
        finally
        {
            // Always release the standby, even when the (possibly faulted)
            // primary throws while being disposed.
            standby.Dispose();
        }
    }
}