diff --git a/src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs b/src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs
index bb15a64..da293cc 100644
--- a/src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs
+++ b/src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs
@@ -29,8 +29,16 @@ public sealed class FailoverAlarmConsumerTests
public bool ThrowOnPoll = true;
public int Polls;
+ /// <summary>
+ /// Number of times <see cref="Subscribe"/> has been called.
+ /// Incremented at entry, before any throw, so every attempt is
+ /// counted regardless of whether <see cref="ThrowOnPoll"/> is set.
+ /// </summary>
+ public int SubscribeCount;
+
public void Subscribe(string s)
{
+ SubscribeCount++;
if (ThrowOnPoll)
{
throw new System.Runtime.InteropServices.COMException("boom", unchecked((int)0x80004005));
@@ -137,6 +145,10 @@ public sealed class FailoverAlarmConsumerTests
[Fact]
public void WhileDegraded_PrimaryHeals_FailsBackAfterStableProbes()
{
+ // threshold=1 so the initial Subscribe failure (the stub's Subscribe throws
+ // while ThrowOnPoll is set) immediately switches to Subtag. stableProbes=2
+ // means two consecutive clean PollOnce calls are needed before failback.
+ // ProbeOnce must NOT call Subscribe — WnWrapAlarmConsumer is single-subscribe; re-calling would always throw.
FlakyPrimary primary = new FlakyPrimary { ThrowOnPoll = true };
StubStandby standby = new StubStandby();
FailoverSettings settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 2);
@@ -145,21 +157,27 @@ public sealed class FailoverAlarmConsumerTests
List changes = new List();
sut.ProviderModeChanged += (_, e) => changes.Add(e);
- sut.Subscribe(@"\\HOST\Galaxy!Area"); // threshold=1 → Subtag (change 1)
+ sut.Subscribe(@"\\HOST\Galaxy!Area"); // threshold=1 → Subtag (mode change 1)
Assert.Single(changes);
Assert.Equal(AlarmProviderMode.Subtag, changes[^1].Mode);
- primary.ThrowOnPoll = false; // primary heals
+ // Primary heals: PollOnce stops throwing. ProbeOnce should call only
+ // PollOnce (not Subscribe) to detect recovery.
+ primary.ThrowOnPoll = false;
+ int subscribeCountAfterFailover = primary.SubscribeCount;
- sut.ProbeOnce(); // clean 1 (no failback yet)
+ sut.ProbeOnce(); // cleanProbes=1 — not yet at stableProbes=2
Assert.Single(changes);
- sut.ProbeOnce(); // clean 2 → failback
+ sut.ProbeOnce(); // cleanProbes=2 → failback to Alarmmgr (mode change 2)
Assert.Equal(2, changes.Count);
Assert.Equal(AlarmProviderMode.Alarmmgr, changes[^1].Mode);
Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
Assert.Equal(0, changes[^1].HResult);
+
+ // ProbeOnce must not have called Subscribe at all during probing.
+ Assert.Equal(subscribeCountAfterFailover, primary.SubscribeCount);
}
[Fact]
@@ -185,6 +203,43 @@ public sealed class FailoverAlarmConsumerTests
Assert.Single(forwarded);
}
+ /// <summary>
+ /// Proves that <see cref="FailoverAlarmConsumer.ProbeOnce"/> never calls
+ /// <c>Subscribe</c> on the primary while degraded. The production primary
+ /// (<see cref="WnWrapAlarmConsumer"/>) is single-subscribe; a second
+ /// <c>Subscribe</c> call would always throw and make failback impossible.
+ /// The probe must re-poll the still-subscribed primary via
+ /// <c>PollOnce</c> only.
+ /// </summary>
+ [Fact]
+ public void ProbeOnce_DoesNotCallPrimarySubscribe()
+ {
+ // threshold=1 → first Subscribe failure immediately switches to Subtag.
+ FlakyPrimary primary = new FlakyPrimary { ThrowOnPoll = true };
+ StubStandby standby = new StubStandby();
+ FailoverSettings settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 3);
+ using FailoverAlarmConsumer sut = new FailoverAlarmConsumer(primary, standby, settings);
+
+ sut.Subscribe(@"\\HOST\Galaxy!Area"); // Subscribe attempt #1 (throws) → Subtag
+
+ // Capture how many Subscribe calls the initial setup caused (exactly 1:
+ // the attempt that threw and triggered failover).
+ int subscribeCountAfterSetup = primary.SubscribeCount;
+ Assert.Equal(1, subscribeCountAfterSetup);
+ Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);
+
+ // Clear the stub's throw flag so the primary no longer fails during probing.
+ primary.ThrowOnPoll = false;
+
+ // Drive several ProbeOnce calls — none should touch Subscribe.
+ sut.ProbeOnce();
+ sut.ProbeOnce();
+ sut.ProbeOnce(); // stableProbes=3 → failback on this call
+
+ Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
+ Assert.Equal(subscribeCountAfterSetup, primary.SubscribeCount);
+ }
+
[Fact]
public void Acknowledge_DelegatesToActiveChild()
{
diff --git a/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs b/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs
index 58fa6e1..35cd513 100644
--- a/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs
+++ b/src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs
@@ -64,6 +64,13 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
private int cleanProbes;
private bool disposed;
+ /// <summary>
+ /// The subscription expression passed to <see cref="Subscribe"/>.
+ /// Stored for documentation and potential future full re-subscribe
+ /// scenarios; the primary is NOT re-subscribed during probing.
+ /// </summary>
+ private string subscriptionExpression = string.Empty;
+
///
/// Composes the failover consumer over its two children.
///
@@ -111,6 +118,10 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
{
if (disposed) throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
+ // Store for documentation; the primary is not torn down on failover
+ // and is therefore not re-subscribed during ProbeOnce.
+ subscriptionExpression = subscription;
+
// Arm the standby first so it is warm regardless of primary outcome.
// A standby subscribe failure is a hard fault (the fallback itself is
// broken) and is surfaced to the caller; it does not feed the primary
@@ -155,14 +166,36 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
/// Only meaningful while the standby is active; a no-op otherwise.
///
///
- /// A clean probe (primary Subscribe + PollOnce both
- /// succeed) increments the clean-probe counter and, once it reaches
- /// , fails back to the
- /// primary. Any probe failure resets the clean-probe counter to 0 so
- /// the consumer requires a fresh unbroken run before failing back.
- /// Exposed publicly so tests (and any external scheduler honoring
- /// cadence) can
- /// drive it directly.
+ /// <para>
+ /// A clean probe (primary <c>PollOnce</c> succeeds without
+ /// throwing) increments the clean-probe counter and, once it reaches
+ /// the <see cref="FailoverSettings"/> <c>stableProbes</c> count, fails back to the
+ /// primary. Any probe failure resets the clean-probe counter to 0 so
+ /// the consumer requires a fresh unbroken run before failing back.
+ /// Exposed publicly so tests (and any external scheduler honoring
+ /// the <see cref="FailoverSettings"/> <c>probeIntervalSeconds</c> cadence) can
+ /// drive it directly.
+ /// </para>
+ /// <para>
+ /// <b>Why <c>PollOnce</c> only — no re-Subscribe.</b>
+ /// Failover does NOT tear down the primary's subscription;
+ /// <see cref="WnWrapAlarmConsumer"/> is single-subscribe and would
+ /// throw on a second call.
+ /// The probe therefore re-polls the still-subscribed primary:
+ /// when the underlying COM provider recovers, <c>PollOnce</c> stops
+ /// throwing and clean probes accumulate toward failback. This covers
+ /// the dominant failure mode (transient COM/provider fault after a
+ /// successful initial subscribe).
+ /// </para>
+ /// <para>
+ /// <b>Known v1 limitation.</b> If the original <c>Subscribe</c> itself
+ /// failed (the primary never reached a subscribed state — only reachable
+ /// when the <see cref="FailoverSettings"/> <c>threshold</c> is 1), polling
+ /// alone cannot re-establish the subscription. That edge case is accepted
+ /// for v1: the operator must restart the session to force a fresh
+ /// subscribe attempt. NOTE(review): confirm an unsubscribed primary's
+ /// <c>PollOnce</c> throws; otherwise clean probes fail back to it anyway.
+ /// </para>
///
public void ProbeOnce()
{
@@ -171,7 +204,10 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
try
{
- primary.Subscribe(string.Empty);
+ // Re-poll the still-subscribed primary. Do NOT call Subscribe —
+ // WnWrapAlarmConsumer is single-subscribe and the primary remains
+ // subscribed across the failover; calling Subscribe again would
+ // always throw InvalidOperationException and prevent failback.
primary.PollOnce();
}
catch (Exception)