From d6c0bb41cae9e79ba4193ad5444e141dbef92462 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sat, 13 Jun 2026 09:49:38 -0400 Subject: [PATCH] worker(alarms): failback probe re-polls the still-subscribed primary (no re-Subscribe) --- .../MxAccess/FailoverAlarmConsumerTests.cs | 63 +++++++++++++++++-- .../MxAccess/FailoverAlarmConsumer.cs | 54 +++++++++++++--- 2 files changed, 104 insertions(+), 13 deletions(-) diff --git a/src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs b/src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs index bb15a64..da293cc 100644 --- a/src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs +++ b/src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs @@ -29,8 +29,16 @@ public sealed class FailoverAlarmConsumerTests public bool ThrowOnPoll = true; public int Polls; + /// + /// Number of times has been called. + /// Incremented at entry, before any throw, so every attempt is + /// counted regardless of whether is set. + /// + public int SubscribeCount; + public void Subscribe(string s) { + SubscribeCount++; if (ThrowOnPoll) { throw new System.Runtime.InteropServices.COMException("boom", unchecked((int)0x80004005)); @@ -137,6 +145,10 @@ public sealed class FailoverAlarmConsumerTests [Fact] public void WhileDegraded_PrimaryHeals_FailsBackAfterStableProbes() { + // threshold=1 so the initial Subscribe failure (PollOnce path) immediately + // switches to Subtag. stableProbes=2 means two consecutive clean PollOnce + // calls are needed before failback. ProbeOnce must NOT call Subscribe — + // WnWrapAlarmConsumer is single-subscribe; re-calling would always throw. 
FlakyPrimary primary = new FlakyPrimary { ThrowOnPoll = true }; StubStandby standby = new StubStandby(); FailoverSettings settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 2); @@ -145,21 +157,27 @@ public sealed class FailoverAlarmConsumerTests List changes = new List(); sut.ProviderModeChanged += (_, e) => changes.Add(e); - sut.Subscribe(@"\\HOST\Galaxy!Area"); // threshold=1 → Subtag (change 1) + sut.Subscribe(@"\\HOST\Galaxy!Area"); // threshold=1 → Subtag (mode change 1) Assert.Single(changes); Assert.Equal(AlarmProviderMode.Subtag, changes[^1].Mode); - primary.ThrowOnPoll = false; // primary heals + // Primary heals: PollOnce stops throwing. ProbeOnce should call only + // PollOnce (not Subscribe) to detect recovery. + primary.ThrowOnPoll = false; + int subscribeCountAfterFailover = primary.SubscribeCount; - sut.ProbeOnce(); // clean 1 (no failback yet) + sut.ProbeOnce(); // cleanProbes=1 — not yet at stableProbes=2 Assert.Single(changes); - sut.ProbeOnce(); // clean 2 → failback + sut.ProbeOnce(); // cleanProbes=2 → failback to Alarmmgr (mode change 2) Assert.Equal(2, changes.Count); Assert.Equal(AlarmProviderMode.Alarmmgr, changes[^1].Mode); Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode); Assert.Equal(0, changes[^1].HResult); + + // ProbeOnce must not have called Subscribe at all during probing. + Assert.Equal(subscribeCountAfterFailover, primary.SubscribeCount); } [Fact] @@ -185,6 +203,43 @@ public sealed class FailoverAlarmConsumerTests Assert.Single(forwarded); } + /// + /// Proves that never calls + /// Subscribe on the primary while degraded. The production primary + /// () is single-subscribe; a second + /// Subscribe call would always throw and make failback impossible. + /// The probe must re-poll the still-subscribed primary via + /// PollOnce only. + /// + [Fact] + public void ProbeOnce_DoesNotCallPrimarySubscribe() + { + // threshold=1 → first Subscribe failure immediately switches to Subtag. 
+ FlakyPrimary primary = new FlakyPrimary { ThrowOnPoll = true }; + StubStandby standby = new StubStandby(); + FailoverSettings settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 3); + using FailoverAlarmConsumer sut = new FailoverAlarmConsumer(primary, standby, settings); + + sut.Subscribe(@"\\HOST\Galaxy!Area"); // Subscribe attempt #1 (throws) → Subtag + + // Capture how many Subscribe calls the initial setup caused (exactly 1: + // the attempt that threw and triggered failover). + int subscribeCountAfterSetup = primary.SubscribeCount; + Assert.Equal(1, subscribeCountAfterSetup); + Assert.Equal(AlarmProviderMode.Subtag, sut.Mode); + + // Let PollOnce succeed so ProbeOnce progresses without throwing. + primary.ThrowOnPoll = false; + + // Drive several ProbeOnce calls — none should touch Subscribe. + sut.ProbeOnce(); + sut.ProbeOnce(); + sut.ProbeOnce(); // stableProbes=3 → failback on this call + + Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode); + Assert.Equal(subscribeCountAfterSetup, primary.SubscribeCount); + } + [Fact] public void Acknowledge_DelegatesToActiveChild() { diff --git a/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs b/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs index 58fa6e1..35cd513 100644 --- a/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs +++ b/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs @@ -64,6 +64,13 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer private int cleanProbes; private bool disposed; + /// + /// The subscription expression passed to . + /// Stored for documentation and potential future full re-subscribe + /// scenarios; the primary is NOT re-subscribed during probing. + /// + private string subscriptionExpression = string.Empty; + /// /// Composes the failover consumer over its two children. 
/// @@ -111,6 +118,10 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer { if (disposed) throw new ObjectDisposedException(nameof(FailoverAlarmConsumer)); + // Store for documentation; the primary is not torn down on failover + // and is therefore not re-subscribed during ProbeOnce. + subscriptionExpression = subscription; + // Arm the standby first so it is warm regardless of primary outcome. // A standby subscribe failure is a hard fault (the fallback itself is // broken) and is surfaced to the caller; it does not feed the primary @@ -155,14 +166,36 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer /// Only meaningful while the standby is active; a no-op otherwise. /// /// - /// A clean probe (primary Subscribe + PollOnce both - /// succeed) increments the clean-probe counter and, once it reaches - /// , fails back to the - /// primary. Any probe failure resets the clean-probe counter to 0 so - /// the consumer requires a fresh unbroken run before failing back. - /// Exposed publicly so tests (and any external scheduler honoring - /// cadence) can - /// drive it directly. + /// + /// A clean probe (primary PollOnce succeeds without + /// throwing) increments the clean-probe counter and, once it reaches + /// , fails back to the + /// primary. Any probe failure resets the clean-probe counter to 0 so + /// the consumer requires a fresh unbroken run before failing back. + /// Exposed publicly so tests (and any external scheduler honoring + /// cadence) can + /// drive it directly. + /// + /// + /// Why PollOnce only — no re-Subscribe. + /// Failover does NOT tear down the primary's subscription; + /// is single-subscribe and would + /// throw on a second call. + /// The probe therefore re-polls the still-subscribed primary: + /// when the underlying COM provider recovers, PollOnce stops + /// throwing and clean probes accumulate toward failback. 
This covers + /// the dominant failure mode (transient COM/provider fault after a + /// successful initial subscribe). + /// + /// + /// Known v1 limitation. If the original + /// Subscribe itself failed (i.e., the primary never reached a + /// subscribed state — only reachable when the + /// failover threshold is 1), polling alone + /// cannot re-establish the subscription. That edge case is accepted + /// for v1: the operator must restart the session to force a fresh + /// subscribe attempt. + /// /// public void ProbeOnce() { @@ -171,7 +204,10 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer try { - primary.Subscribe(string.Empty); + // Re-poll the still-subscribed primary. Do NOT call Subscribe — + // WnWrapAlarmConsumer is single-subscribe and the primary remains + // subscribed across the failover; calling Subscribe again would + // always throw InvalidOperationException and prevent failback. primary.PollOnce(); } catch (Exception)