diff --git a/src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs b/src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs
new file mode 100644
index 0000000..bb15a64
--- /dev/null
+++ b/src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs
@@ -0,0 +1,239 @@
+using System;
+using System.Collections.Generic;
+using ZB.MOM.WW.MxGateway.Contracts.Proto;
+using ZB.MOM.WW.MxGateway.Worker.MxAccess;
+using Xunit;
+
+namespace ZB.MOM.WW.MxGateway.Worker.Tests.MxAccess;
+
+/// <summary>
+/// Unit tests for <see cref="FailoverAlarmConsumer"/>: prove the
+/// auto-failover (consecutive primary COM failures → standby) and
+/// auto-failback (consecutive clean probes → primary) state machine,
+/// active-child transition forwarding, active-child delegation of
+/// acknowledgments, and re-arming of the primary with the caller's
+/// subscription during failback probes. Fakes stand in for both children
+/// so this needs no AVEVA install.
+/// </summary>
+public sealed class FailoverAlarmConsumerTests
+{
+    // NOTE(review): the generic type arguments in this file (EventHandler<T>,
+    // IReadOnlyList<T>, List<T>) were missing from the reviewed text and are
+    // reconstructed from usage — confirm against IMxAccessAlarmConsumer.
+
+    /// <summary>
+    /// Primary fake whose Subscribe/PollOnce throw a COMException while
+    /// <see cref="ThrowOnPoll"/> is set, modeling a wnwrap consumer that
+    /// surfaces COM HRESULT failures. Records the last subscription it was
+    /// handed and can re-raise a transition so before-failover forwarding
+    /// can be exercised.
+    /// </summary>
+    private sealed class FlakyPrimary : IMxAccessAlarmConsumer
+    {
+        public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;
+
+        public bool ThrowOnPoll = true;
+        public int Polls;
+        public string? LastSubscription;
+
+        public void Subscribe(string s)
+        {
+            // Record before throwing so a failed subscribe is still observable.
+            LastSubscription = s;
+            if (ThrowOnPoll)
+            {
+                throw new System.Runtime.InteropServices.COMException("boom", unchecked((int)0x80004005));
+            }
+        }
+
+        public void PollOnce()
+        {
+            Polls++;
+            if (ThrowOnPoll)
+            {
+                throw new System.Runtime.InteropServices.COMException("boom", unchecked((int)0x80004005));
+            }
+        }
+
+        public int AcknowledgeByGuid(Guid g, string c, string a, string b, string d, string e) => 11;
+
+        public int AcknowledgeByName(string n, string p, string gr, string c, string a, string b, string d, string e) => 11;
+
+        public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms() => Array.Empty<MxAlarmSnapshotRecord>();
+
+        public void Dispose() { }
+
+        public void Raise(MxAlarmTransitionEvent e) => AlarmTransitionEmitted?.Invoke(this, e);
+    }
+
+    /// <summary>
+    /// Standby fake (subtag stand-in): never throws, records that it was
+    /// armed, and can re-raise a transition.
+    /// </summary>
+    private sealed class StubStandby : IMxAccessAlarmConsumer
+    {
+        public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;
+
+        public bool Subscribed;
+
+        public void Subscribe(string s) => Subscribed = true;
+
+        public void PollOnce() { }
+
+        public int AcknowledgeByGuid(Guid g, string c, string a, string b, string d, string e) => 22;
+
+        public int AcknowledgeByName(string n, string p, string gr, string c, string a, string b, string d, string e) => 22;
+
+        public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms() => Array.Empty<MxAlarmSnapshotRecord>();
+
+        public void Dispose() { }
+
+        public void Raise(MxAlarmTransitionEvent e) => AlarmTransitionEmitted?.Invoke(this, e);
+    }
+
+    private static MxAlarmTransitionEvent SampleTransition() => new MxAlarmTransitionEvent
+    {
+        Record = new MxAlarmSnapshotRecord { AlarmGuid = Guid.NewGuid() },
+        PreviousState = MxAlarmStateKind.Unspecified,
+    };
+
+    [Fact]
+    public void Primary_FailsThresholdTimes_SwitchesToSubtag()
+    {
+        FlakyPrimary primary = new FlakyPrimary { ThrowOnPoll = true };
+        StubStandby standby = new StubStandby();
+        FailoverSettings settings = new FailoverSettings(threshold: 3, probeIntervalSeconds: 0, stableProbes: 1);
+        using FailoverAlarmConsumer sut = new FailoverAlarmConsumer(primary, standby, settings);
+
+        List<AlarmProviderModeChange> changes = new List<AlarmProviderModeChange>();
+        sut.ProviderModeChanged += (_, e) => changes.Add(e);
+
+        sut.Subscribe(@"\\HOST\Galaxy!Area"); // failure 1 (primary), standby armed
+        Assert.True(standby.Subscribed);
+        Assert.Empty(changes);
+
+        sut.PollOnce(); // failure 2
+        Assert.Empty(changes);
+
+        sut.PollOnce(); // failure 3 → switch
+
+        Assert.Single(changes);
+        Assert.Equal(AlarmProviderMode.Subtag, changes[0].Mode);
+        Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);
+        Assert.Equal(unchecked((int)0x80004005), changes[0].HResult);
+    }
+
+    [Fact]
+    public void AfterSwitch_StandbyTransitionsAreForwarded()
+    {
+        FlakyPrimary primary = new FlakyPrimary { ThrowOnPoll = true };
+        StubStandby standby = new StubStandby();
+        FailoverSettings settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
+        using FailoverAlarmConsumer sut = new FailoverAlarmConsumer(primary, standby, settings);
+
+        MxAlarmTransitionEvent? forwarded = null;
+        sut.AlarmTransitionEmitted += (_, e) => forwarded = e;
+
+        sut.Subscribe(@"\\HOST\Galaxy!Area"); // threshold=1 → switch to Subtag immediately
+        Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);
+
+        MxAlarmTransitionEvent transition = SampleTransition();
+        standby.Raise(transition);
+
+        Assert.Same(transition, forwarded);
+    }
+
+    [Fact]
+    public void WhileDegraded_PrimaryHeals_FailsBackAfterStableProbes()
+    {
+        FlakyPrimary primary = new FlakyPrimary { ThrowOnPoll = true };
+        StubStandby standby = new StubStandby();
+        FailoverSettings settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 2);
+        using FailoverAlarmConsumer sut = new FailoverAlarmConsumer(primary, standby, settings);
+
+        List<AlarmProviderModeChange> changes = new List<AlarmProviderModeChange>();
+        sut.ProviderModeChanged += (_, e) => changes.Add(e);
+
+        sut.Subscribe(@"\\HOST\Galaxy!Area"); // threshold=1 → Subtag (change 1)
+        Assert.Single(changes);
+        Assert.Equal(AlarmProviderMode.Subtag, changes[^1].Mode);
+
+        primary.ThrowOnPoll = false; // primary heals
+
+        sut.ProbeOnce(); // clean 1 (no failback yet)
+        Assert.Single(changes);
+
+        sut.ProbeOnce(); // clean 2 → failback
+
+        Assert.Equal(2, changes.Count);
+        Assert.Equal(AlarmProviderMode.Alarmmgr, changes[^1].Mode);
+        Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
+        Assert.Equal(0, changes[^1].HResult);
+    }
+
+    [Fact]
+    public void FailbackProbe_ResubscribesPrimaryWithOriginalSubscription()
+    {
+        const string Subscription = @"\\HOST\Galaxy!Area";
+        FlakyPrimary primary = new FlakyPrimary { ThrowOnPoll = true };
+        StubStandby standby = new StubStandby();
+        FailoverSettings settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
+        using FailoverAlarmConsumer sut = new FailoverAlarmConsumer(primary, standby, settings);
+
+        sut.Subscribe(Subscription); // threshold=1 → Subtag
+        primary.LastSubscription = null; // forget the initial subscribe
+        primary.ThrowOnPoll = false; // primary heals
+
+        sut.ProbeOnce(); // clean 1 → failback
+
+        Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
+        Assert.Equal(Subscription, primary.LastSubscription);
+    }
+
+    [Fact]
+    public void BeforeFailover_PrimaryTransitionsAreForwarded()
+    {
+        FlakyPrimary primary = new FlakyPrimary { ThrowOnPoll = false }; // healthy, can Raise
+        StubStandby standby = new StubStandby();
+        FailoverSettings settings = new FailoverSettings(threshold: 3, probeIntervalSeconds: 0, stableProbes: 1);
+        using FailoverAlarmConsumer sut = new FailoverAlarmConsumer(primary, standby, settings);
+
+        List<MxAlarmTransitionEvent> forwarded = new List<MxAlarmTransitionEvent>();
+        sut.AlarmTransitionEmitted += (_, e) => forwarded.Add(e);
+
+        sut.Subscribe(@"\\HOST\Galaxy!Area");
+        Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
+
+        MxAlarmTransitionEvent fromPrimary = SampleTransition();
+        primary.Raise(fromPrimary); // active=Primary → forwarded
+        Assert.Single(forwarded);
+        Assert.Same(fromPrimary, forwarded[0]);
+
+        standby.Raise(SampleTransition()); // standby not active → suppressed
+        Assert.Single(forwarded);
+    }
+
+    [Fact]
+    public void Acknowledge_DelegatesToActiveChild()
+    {
+        FlakyPrimary primary = new FlakyPrimary { ThrowOnPoll = false };
+        StubStandby standby = new StubStandby();
+        FailoverSettings settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
+        using FailoverAlarmConsumer sut = new FailoverAlarmConsumer(primary, standby, settings);
+
+        sut.Subscribe(@"\\HOST\Galaxy!Area");
+
+        // Active = Primary → primary's sentinel value (11).
+        Assert.Equal(11, sut.AcknowledgeByGuid(Guid.NewGuid(), "c", "n", "node", "dom", "full"));
+        Assert.Equal(11, sut.AcknowledgeByName("a", "p", "g", "c", "n", "node", "dom", "full"));
+
+        // Force a failover by failing the primary past threshold.
+        primary.ThrowOnPoll = true;
+        sut.PollOnce(); // threshold=1 → switch to Standby
+        Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);
+
+        // Active = Standby → standby's sentinel value (22).
+        Assert.Equal(22, sut.AcknowledgeByGuid(Guid.NewGuid(), "c", "n", "node", "dom", "full"));
+        Assert.Equal(22, sut.AcknowledgeByName("a", "p", "g", "c", "n", "node", "dom", "full"));
+    }
+}
diff --git a/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/AlarmProviderModeChange.cs b/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/AlarmProviderModeChange.cs
new file mode 100644
index 0000000..a38dfb2
--- /dev/null
+++ b/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/AlarmProviderModeChange.cs
@@ -0,0 +1,61 @@
+using System;
+using ZB.MOM.WW.MxGateway.Contracts.Proto;
+
+namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
+
+/// <summary>
+/// Raised by <see cref="FailoverAlarmConsumer"/> every time the active
+/// alarm source switches between the primary (alarmmgr) consumer and the
+/// standby (subtag) consumer. The worker translates this into the proto
+/// family <c>OnAlarmProviderModeChanged</c> so connected gateway clients
+/// can surface the degraded/recovered state.
+/// </summary>
+/// <remarks>
+/// <para>
+/// Plain class with constructor-assigned get-only properties — not a
+/// <c>record</c> or <c>init</c>-only type — because the worker
+/// multi-targets .NET Framework 4.8, which lacks
+/// <c>System.Runtime.CompilerServices.IsExternalInit</c> (CS0518).
+/// </para>
+/// </remarks>
+public sealed class AlarmProviderModeChange : EventArgs
+{
+    /// <summary>
+    /// Initializes the change event payload.
+    /// </summary>
+    /// <param name="mode">The provider mode now active after the switch.</param>
+    /// <param name="reason">Human-readable reason for the switch.</param>
+    /// <param name="hResult">
+    /// The COM HRESULT that triggered a failover, or 0 for a clean
+    /// failback / no associated HRESULT.
+    /// </param>
+    /// <param name="atUtc">The UTC instant the switch occurred.</param>
+    public AlarmProviderModeChange(AlarmProviderMode mode, string reason, int hResult, DateTime atUtc)
+    {
+        Mode = mode;
+        Reason = reason ?? string.Empty;
+        HResult = hResult;
+        AtUtc = atUtc;
+    }
+
+    /// <summary>
+    /// The provider mode now active after the switch.
+    /// </summary>
+    public AlarmProviderMode Mode { get; }
+
+    /// <summary>
+    /// Human-readable reason for the switch (e.g. the failing COM call, or
+    /// "recovered" for a failback).
+    /// </summary>
+    public string Reason { get; }
+
+    /// <summary>
+    /// The COM HRESULT that triggered a failover, or 0 when none applies.
+    /// </summary>
+    public int HResult { get; }
+
+    /// <summary>
+    /// The UTC instant the switch occurred.
+    /// </summary>
+    public DateTime AtUtc { get; }
+}
diff --git a/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs b/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs
new file mode 100644
index 0000000..58fa6e1
--- /dev/null
+++ b/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs
@@ -0,0 +1,373 @@
+using System;
+using System.Collections.Generic;
+using System.Runtime.InteropServices;
+using ZB.MOM.WW.MxGateway.Contracts.Proto;
+
+namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
+
+/// <summary>
+/// Composite <see cref="IMxAccessAlarmConsumer"/> that owns a PRIMARY
+/// consumer (the wnwrap alarmmgr source) and a STANDBY consumer (the
+/// <c>SubtagAlarmConsumer</c> subtag fallback), and switches between
+/// them automatically:
+/// <list type="bullet">
+/// <item>
+/// Auto-fails-over to standby after
+/// <see cref="FailoverSettings.Threshold"/> consecutive COM
+/// failures on the primary.
+/// </item>
+/// <item>
+/// Auto-fails-back to primary after
+/// <see cref="FailoverSettings.StableProbes"/> consecutive clean
+/// failback probes against the recovering primary.
+/// </item>
+/// </list>
+/// It re-raises <see cref="AlarmTransitionEmitted"/> from whichever child
+/// is active and raises <see cref="ProviderModeChanged"/> on every switch.
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Active-child event forwarding.</b> This type subscribes
+/// to both children's <see cref="IMxAccessAlarmConsumer.AlarmTransitionEmitted"/>
+/// events up front and gates re-raising on identity: a child transition
+/// is forwarded only when its <c>sender</c> is the currently active
+/// child, so each child must raise the event with itself as sender. The
+/// standby is armed (subscribed) from the start so its snapshot is warm
+/// at the moment of failover, but its transitions stay suppressed until
+/// it becomes active. Gating-by-active is simpler and less error-prone
+/// than subscribe/unsubscribe churn on every switch, and it avoids a
+/// race where a transition fires during the switch.
+/// </para>
+/// <para>
+/// <b>Threading.</b> Like its children, this type is driven
+/// entirely on the worker's STA: <see cref="Subscribe"/>,
+/// <see cref="PollOnce"/>, <see cref="ProbeOnce"/>, and the
+/// <c>AcknowledgeBy*</c> calls are all invoked from the apartment that
+/// owns the underlying COM objects. It owns no locks of its own and no
+/// internal timer; the worker drives <see cref="PollOnce"/> on a timer.
+/// </para>
+/// <para>
+/// NOTE(review): the generic type arguments on the events and on
+/// <see cref="SnapshotActiveAlarms"/> were missing from the reviewed text
+/// and are reconstructed from usage — confirm against
+/// <see cref="IMxAccessAlarmConsumer"/>.
+/// </para>
+/// </remarks>
+public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
+{
+    private enum Active
+    {
+        Primary,
+        Standby,
+    }
+
+    private readonly IMxAccessAlarmConsumer primary;
+    private readonly IMxAccessAlarmConsumer standby;
+    private readonly FailoverSettings settings;
+
+    private Active active = Active.Primary;
+    private AlarmProviderMode mode = AlarmProviderMode.Alarmmgr;
+    private int consecutiveFailures;
+    private int cleanProbes;
+    private bool disposed;
+
+    // Last expression handed to Subscribe. Failback probes re-arm the
+    // primary with it; empty until Subscribe has been called.
+    private string lastSubscription = string.Empty;
+
+    /// <summary>
+    /// Composes the failover consumer over its two children.
+    /// </summary>
+    /// <param name="primary">The PRIMARY (alarmmgr) consumer.</param>
+    /// <param name="standby">The STANDBY (subtag) consumer.</param>
+    /// <param name="settings">The failover/failback tunables.</param>
+    /// <exception cref="ArgumentNullException">
+    /// Any argument is <see langword="null"/>.
+    /// </exception>
+    public FailoverAlarmConsumer(
+        IMxAccessAlarmConsumer primary,
+        IMxAccessAlarmConsumer standby,
+        FailoverSettings settings)
+    {
+        this.primary = primary ?? throw new ArgumentNullException(nameof(primary));
+        this.standby = standby ?? throw new ArgumentNullException(nameof(standby));
+        this.settings = settings ?? throw new ArgumentNullException(nameof(settings));
+
+        this.primary.AlarmTransitionEmitted += OnChildTransition;
+        this.standby.AlarmTransitionEmitted += OnChildTransition;
+    }
+
+    /// <inheritdoc />
+    public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;
+
+    /// <summary>
+    /// Fires on every switch between primary and standby. Carries the new
+    /// <see cref="AlarmProviderMode"/>, the reason, the triggering HRESULT
+    /// (0 for a clean failback), and the UTC instant.
+    /// </summary>
+    public event EventHandler<AlarmProviderModeChange>? ProviderModeChanged;
+
+    /// <summary>
+    /// The provider mode currently active.
+    /// </summary>
+    public AlarmProviderMode Mode => mode;
+
+    /// <inheritdoc />
+    /// <remarks>
+    /// Arms BOTH children up front so the standby snapshot is warm at the
+    /// moment of failover. The standby is subscribed first, so it is armed
+    /// even if the primary's <c>Subscribe</c> throws; a standby subscribe
+    /// failure is surfaced (rethrown) but does not count toward primary
+    /// failover. The primary subscribe runs through the failure-counting
+    /// wrapper so a COM failure on subscribe contributes to the failover
+    /// threshold. The expression is remembered so that failback probes
+    /// re-arm the primary with the same subscription.
+    /// </remarks>
+    public void Subscribe(string subscription)
+    {
+        ThrowIfDisposed();
+
+        // Remember the expression before touching either child so a later
+        // failback probe re-subscribes the primary to the same alarms.
+        lastSubscription = subscription ?? string.Empty;
+
+        // Arm the standby first so it is warm regardless of primary outcome.
+        // A standby subscribe failure is a hard fault (the fallback itself is
+        // broken) and is surfaced to the caller; it does not feed the primary
+        // failover counter.
+        standby.Subscribe(subscription);
+
+        // Drive the primary subscribe through the failure-counting wrapper so
+        // a COM failure here counts toward the failover threshold instead of
+        // escaping. Swallowing the exception is deliberate: the standby is
+        // already armed, so a failed primary subscribe just nudges the state
+        // machine toward (or into) standby rather than aborting startup.
+        RunPrimary(() => primary.Subscribe(subscription), "Subscribe");
+    }
+
+    /// <inheritdoc />
+    /// <remarks>
+    /// While the primary is active, drives <c>primary.PollOnce</c> through
+    /// the failure-counting wrapper. While degraded (standby active),
+    /// drives <c>standby.PollOnce</c> and then runs one failback probe per
+    /// call via <see cref="ProbeOnce"/> — the worker drives this on a
+    /// timer, so one degraded poll equals one probe tick.
+    /// </remarks>
+    public void PollOnce()
+    {
+        ThrowIfDisposed();
+
+        if (active == Active.Primary)
+        {
+            RunPrimary(() => primary.PollOnce(), "PollOnce");
+            return;
+        }
+
+        // Degraded: pump the standby for live transitions, then probe the
+        // primary for recovery. Standby PollOnce is a no-op for the subtag
+        // consumer but kept for symmetry / future standby sources.
+        standby.PollOnce();
+        ProbeOnce();
+    }
+
+    /// <summary>
+    /// Runs one failback probe against the (presumed recovering) primary.
+    /// Only meaningful while the standby is active; a no-op otherwise.
+    /// </summary>
+    /// <remarks>
+    /// A probe re-subscribes the primary with the expression last passed
+    /// to <see cref="Subscribe"/> (empty if it was never called) and polls
+    /// it once. A clean probe (both calls succeed) increments the
+    /// clean-probe counter and, once it reaches
+    /// <see cref="FailoverSettings.StableProbes"/>, fails back to the
+    /// primary. Any probe failure resets the clean-probe counter to 0 so
+    /// the consumer requires a fresh unbroken run before failing back.
+    /// Exposed publicly so tests (and any external scheduler honoring
+    /// <see cref="FailoverSettings.ProbeIntervalSeconds"/> cadence) can
+    /// drive it directly.
+    /// </remarks>
+    public void ProbeOnce()
+    {
+        ThrowIfDisposed();
+        if (active != Active.Standby)
+        {
+            return;
+        }
+
+        try
+        {
+            // Re-arm with the caller's expression, not an empty one: after a
+            // failback the primary is the live source and must be watching
+            // the same alarms the caller subscribed to.
+            primary.Subscribe(lastSubscription);
+            primary.PollOnce();
+        }
+        catch (Exception)
+        {
+            // Probe failed — the primary is still unhealthy. Demand a fresh
+            // unbroken run of StableProbes clean polls before failing back.
+            cleanProbes = 0;
+            return;
+        }
+
+        cleanProbes++;
+        if (cleanProbes >= settings.StableProbes)
+        {
+            SwitchToPrimary("recovered", 0);
+        }
+    }
+
+    /// <inheritdoc />
+    public int AcknowledgeByGuid(
+        Guid alarmGuid,
+        string ackComment,
+        string ackOperatorName,
+        string ackOperatorNode,
+        string ackOperatorDomain,
+        string ackOperatorFullName)
+    {
+        ThrowIfDisposed();
+        return ActiveChild.AcknowledgeByGuid(
+            alarmGuid, ackComment, ackOperatorName, ackOperatorNode, ackOperatorDomain, ackOperatorFullName);
+    }
+
+    /// <inheritdoc />
+    public int AcknowledgeByName(
+        string alarmName,
+        string providerName,
+        string groupName,
+        string ackComment,
+        string ackOperatorName,
+        string ackOperatorNode,
+        string ackOperatorDomain,
+        string ackOperatorFullName)
+    {
+        ThrowIfDisposed();
+        return ActiveChild.AcknowledgeByName(
+            alarmName, providerName, groupName, ackComment,
+            ackOperatorName, ackOperatorNode, ackOperatorDomain, ackOperatorFullName);
+    }
+
+    /// <inheritdoc />
+    public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms()
+    {
+        ThrowIfDisposed();
+        return ActiveChild.SnapshotActiveAlarms();
+    }
+
+    private IMxAccessAlarmConsumer ActiveChild => active == Active.Primary ? primary : standby;
+
+    /// <summary>
+    /// Throws <see cref="ObjectDisposedException"/> once
+    /// <see cref="Dispose"/> has run.
+    /// </summary>
+    private void ThrowIfDisposed()
+    {
+        if (disposed)
+        {
+            throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
+        }
+    }
+
+    /// <summary>
+    /// Runs a primary COM action, counting consecutive failures. A
+    /// <see cref="COMException"/> (or any exception, treated as a COM
+    /// failure) increments the failure counter and, at
+    /// <see cref="FailoverSettings.Threshold"/> while the primary is still
+    /// active, switches to the standby. A success resets the counter.
+    /// </summary>
+    /// <param name="action">The primary call to run.</param>
+    /// <param name="operation">
+    /// Name of the call, used in the failover reason text.
+    /// </param>
+    private void RunPrimary(Action action, string operation)
+    {
+        try
+        {
+            action();
+        }
+        catch (Exception ex)
+        {
+            consecutiveFailures++;
+
+            // Only a COMException carries a meaningful COM HRESULT; report 0
+            // for anything else.
+            int hresult = ex is COMException ? ex.HResult : 0;
+            if (active == Active.Primary && consecutiveFailures >= settings.Threshold)
+            {
+                SwitchToStandby($"primary {operation} failed", hresult);
+            }
+
+            return;
+        }
+
+        consecutiveFailures = 0;
+    }
+
+    private void SwitchToStandby(string reason, int hresult)
+    {
+        active = Active.Standby;
+        mode = AlarmProviderMode.Subtag;
+        consecutiveFailures = 0;
+        cleanProbes = 0;
+
+        // Warm the standby snapshot for the gateway hand-off. The gateway
+        // reconciles state from this snapshot, so the return value is not
+        // consumed here — the call exists for its priming side effect.
+        _ = standby.SnapshotActiveAlarms();
+
+        RaiseModeChanged(AlarmProviderMode.Subtag, reason, hresult);
+    }
+
+    private void SwitchToPrimary(string reason, int hresult)
+    {
+        active = Active.Primary;
+        mode = AlarmProviderMode.Alarmmgr;
+        consecutiveFailures = 0;
+        cleanProbes = 0;
+        RaiseModeChanged(AlarmProviderMode.Alarmmgr, reason, hresult);
+    }
+
+    private void RaiseModeChanged(AlarmProviderMode newMode, string reason, int hresult)
+    {
+        ProviderModeChanged?.Invoke(
+            this,
+            new AlarmProviderModeChange(newMode, reason, hresult, DateTime.UtcNow));
+    }
+
+    private void OnChildTransition(object? sender, MxAlarmTransitionEvent e)
+    {
+        // Gate by active child: forward only the active source's transitions.
+        if (ReferenceEquals(sender, ActiveChild))
+        {
+            AlarmTransitionEmitted?.Invoke(this, e);
+        }
+    }
+
+    /// <inheritdoc />
+    /// <remarks>
+    /// Detaches from both children's events and disposes both. The standby
+    /// is disposed even if disposing the primary throws.
+    /// </remarks>
+    public void Dispose()
+    {
+        if (disposed)
+        {
+            return;
+        }
+
+        disposed = true;
+
+        primary.AlarmTransitionEmitted -= OnChildTransition;
+        standby.AlarmTransitionEmitted -= OnChildTransition;
+
+        try
+        {
+            primary.Dispose();
+        }
+        finally
+        {
+            standby.Dispose();
+        }
+    }
+}
diff --git a/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverSettings.cs b/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverSettings.cs
new file mode 100644
index 0000000..03dc11a
--- /dev/null
+++ b/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverSettings.cs
@@ -0,0 +1,59 @@
+namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
+
+/// <summary>
+/// Tunables for <see cref="FailoverAlarmConsumer"/>'s auto-failover /
+/// auto-failback state machine. Constructor-clamped to safe minimums so a
+/// misconfigured options bind can never produce a zero/negative threshold
+/// that would either never fail over or fail over on the first hiccup
+/// unintentionally.
+/// </summary>
+/// <remarks>
+/// <para>
+/// Plain class with constructor-assigned get-only properties — not a
+/// <c>record</c> or <c>init</c>-only type — because the worker
+/// multi-targets .NET Framework 4.8, which lacks
+/// <c>System.Runtime.CompilerServices.IsExternalInit</c> and so cannot
+/// compile <c>init</c> accessors or positional records (CS0518).
+/// </para>
+/// </remarks>
+public sealed class FailoverSettings
+{
+    /// <summary>
+    /// Initializes the settings, clamping each value to its safe minimum.
+    /// </summary>
+    /// <param name="threshold">
+    /// Consecutive primary COM failures that trigger a switch to standby.
+    /// Clamped to a minimum of 1.
+    /// </param>
+    /// <param name="probeIntervalSeconds">
+    /// Minimum spacing (seconds) between failback probes against the
+    /// recovering primary. Clamped to a minimum of 0 (probe every tick).
+    /// </param>
+    /// <param name="stableProbes">
+    /// Consecutive clean failback probes required before switching back to
+    /// the primary. Clamped to a minimum of 1.
+    /// </param>
+    public FailoverSettings(int threshold, int probeIntervalSeconds, int stableProbes)
+    {
+        Threshold = threshold < 1 ? 1 : threshold;
+        ProbeIntervalSeconds = probeIntervalSeconds < 0 ? 0 : probeIntervalSeconds;
+        StableProbes = stableProbes < 1 ? 1 : stableProbes;
+    }
+
+    /// <summary>
+    /// Consecutive primary COM failures that trigger a switch to standby.
+    /// </summary>
+    public int Threshold { get; }
+
+    /// <summary>
+    /// Minimum spacing, in seconds, between failback probes.
+    /// <see cref="FailoverAlarmConsumer"/> never reads this value itself;
+    /// whatever schedules the probes is expected to honor it.
+    /// </summary>
+    public int ProbeIntervalSeconds { get; }
+
+    /// <summary>
+    /// Consecutive clean failback probes required before failing back.
+    /// </summary>
+    public int StableProbes { get; }
+}