worker(alarms): failback probe re-polls the still-subscribed primary (no re-Subscribe)
This commit is contained in:
@@ -29,8 +29,16 @@ public sealed class FailoverAlarmConsumerTests
|
||||
public bool ThrowOnPoll = true;
|
||||
public int Polls;
|
||||
|
||||
/// <summary>
|
||||
/// Number of times <see cref="Subscribe"/> has been called.
|
||||
/// Incremented at entry, before any throw, so every attempt is
|
||||
/// counted regardless of whether <see cref="ThrowOnPoll"/> is set.
|
||||
/// </summary>
|
||||
public int SubscribeCount;
|
||||
|
||||
public void Subscribe(string s)
|
||||
{
|
||||
SubscribeCount++;
|
||||
if (ThrowOnPoll)
|
||||
{
|
||||
throw new System.Runtime.InteropServices.COMException("boom", unchecked((int)0x80004005));
|
||||
@@ -137,6 +145,10 @@ public sealed class FailoverAlarmConsumerTests
|
||||
[Fact]
|
||||
public void WhileDegraded_PrimaryHeals_FailsBackAfterStableProbes()
|
||||
{
|
||||
// threshold=1 so the initial Subscribe failure (PollOnce path) immediately
|
||||
// switches to Subtag. stableProbes=2 means two consecutive clean PollOnce
|
||||
// calls are needed before failback. ProbeOnce must NOT call Subscribe —
|
||||
// WnWrapAlarmConsumer is single-subscribe; re-calling would always throw.
|
||||
FlakyPrimary primary = new FlakyPrimary { ThrowOnPoll = true };
|
||||
StubStandby standby = new StubStandby();
|
||||
FailoverSettings settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 2);
|
||||
@@ -145,21 +157,27 @@ public sealed class FailoverAlarmConsumerTests
|
||||
List<AlarmProviderModeChange> changes = new List<AlarmProviderModeChange>();
|
||||
sut.ProviderModeChanged += (_, e) => changes.Add(e);
|
||||
|
||||
sut.Subscribe(@"\\HOST\Galaxy!Area"); // threshold=1 → Subtag (change 1)
|
||||
sut.Subscribe(@"\\HOST\Galaxy!Area"); // threshold=1 → Subtag (mode change 1)
|
||||
Assert.Single(changes);
|
||||
Assert.Equal(AlarmProviderMode.Subtag, changes[^1].Mode);
|
||||
|
||||
primary.ThrowOnPoll = false; // primary heals
|
||||
// Primary heals: PollOnce stops throwing. ProbeOnce should call only
|
||||
// PollOnce (not Subscribe) to detect recovery.
|
||||
primary.ThrowOnPoll = false;
|
||||
int subscribeCountAfterFailover = primary.SubscribeCount;
|
||||
|
||||
sut.ProbeOnce(); // clean 1 (no failback yet)
|
||||
sut.ProbeOnce(); // cleanProbes=1 — not yet at stableProbes=2
|
||||
Assert.Single(changes);
|
||||
|
||||
sut.ProbeOnce(); // clean 2 → failback
|
||||
sut.ProbeOnce(); // cleanProbes=2 → failback to Alarmmgr (mode change 2)
|
||||
|
||||
Assert.Equal(2, changes.Count);
|
||||
Assert.Equal(AlarmProviderMode.Alarmmgr, changes[^1].Mode);
|
||||
Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
|
||||
Assert.Equal(0, changes[^1].HResult);
|
||||
|
||||
// ProbeOnce must not have called Subscribe at all during probing.
|
||||
Assert.Equal(subscribeCountAfterFailover, primary.SubscribeCount);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -185,6 +203,43 @@ public sealed class FailoverAlarmConsumerTests
|
||||
Assert.Single(forwarded);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Proves that <see cref="FailoverAlarmConsumer.ProbeOnce"/> never calls
|
||||
/// <c>Subscribe</c> on the primary while degraded. The production primary
|
||||
/// (<see cref="WnWrapAlarmConsumer"/>) is single-subscribe; a second
|
||||
/// <c>Subscribe</c> call would always throw and make failback impossible.
|
||||
/// The probe must re-poll the still-subscribed primary via
|
||||
/// <c>PollOnce</c> only.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public void ProbeOnce_DoesNotCallPrimarySubscribe()
|
||||
{
|
||||
// threshold=1 → first Subscribe failure immediately switches to Subtag.
|
||||
FlakyPrimary primary = new FlakyPrimary { ThrowOnPoll = true };
|
||||
StubStandby standby = new StubStandby();
|
||||
FailoverSettings settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 3);
|
||||
using FailoverAlarmConsumer sut = new FailoverAlarmConsumer(primary, standby, settings);
|
||||
|
||||
sut.Subscribe(@"\\HOST\Galaxy!Area"); // Subscribe attempt #1 (throws) → Subtag
|
||||
|
||||
// Capture how many Subscribe calls the initial setup caused (exactly 1:
|
||||
// the attempt that threw and triggered failover).
|
||||
int subscribeCountAfterSetup = primary.SubscribeCount;
|
||||
Assert.Equal(1, subscribeCountAfterSetup);
|
||||
Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);
|
||||
|
||||
// Let PollOnce succeed so ProbeOnce progresses without throwing.
|
||||
primary.ThrowOnPoll = false;
|
||||
|
||||
// Drive several ProbeOnce calls — none should touch Subscribe.
|
||||
sut.ProbeOnce();
|
||||
sut.ProbeOnce();
|
||||
sut.ProbeOnce(); // stableProbes=3 → failback on this call
|
||||
|
||||
Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
|
||||
Assert.Equal(subscribeCountAfterSetup, primary.SubscribeCount);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Acknowledge_DelegatesToActiveChild()
|
||||
{
|
||||
|
||||
@@ -64,6 +64,13 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
|
||||
private int cleanProbes;
|
||||
private bool disposed;
|
||||
|
||||
/// <summary>
|
||||
/// The subscription expression passed to <see cref="Subscribe"/>.
|
||||
/// Stored for documentation and potential future full re-subscribe
|
||||
/// scenarios; the primary is NOT re-subscribed during probing.
|
||||
/// </summary>
|
||||
private string subscriptionExpression = string.Empty;
|
||||
|
||||
/// <summary>
|
||||
/// Composes the failover consumer over its two children.
|
||||
/// </summary>
|
||||
@@ -111,6 +118,10 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
|
||||
{
|
||||
if (disposed) throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
|
||||
|
||||
// Store for documentation; the primary is not torn down on failover
|
||||
// and is therefore not re-subscribed during ProbeOnce.
|
||||
subscriptionExpression = subscription;
|
||||
|
||||
// Arm the standby first so it is warm regardless of primary outcome.
|
||||
// A standby subscribe failure is a hard fault (the fallback itself is
|
||||
// broken) and is surfaced to the caller; it does not feed the primary
|
||||
@@ -155,14 +166,36 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
|
||||
/// Only meaningful while the standby is active; a no-op otherwise.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// A clean probe (primary <c>Subscribe</c> + <c>PollOnce</c> both
|
||||
/// succeed) increments the clean-probe counter and, once it reaches
|
||||
/// <see cref="FailoverSettings.StableProbes"/>, fails back to the
|
||||
/// primary. Any probe failure resets the clean-probe counter to 0 so
|
||||
/// the consumer requires a fresh unbroken run before failing back.
|
||||
/// Exposed publicly so tests (and any external scheduler honoring
|
||||
/// <see cref="FailoverSettings.ProbeIntervalSeconds"/> cadence) can
|
||||
/// drive it directly.
|
||||
/// <para>
|
||||
/// A clean probe (primary <c>PollOnce</c> succeeds without
|
||||
/// throwing) increments the clean-probe counter and, once it reaches
|
||||
/// <see cref="FailoverSettings.StableProbes"/>, fails back to the
|
||||
/// primary. Any probe failure resets the clean-probe counter to 0 so
|
||||
/// the consumer requires a fresh unbroken run before failing back.
|
||||
/// Exposed publicly so tests (and any external scheduler honoring
|
||||
/// <see cref="FailoverSettings.ProbeIntervalSeconds"/> cadence) can
|
||||
/// drive it directly.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <strong>Why PollOnce only — no re-Subscribe.</strong>
|
||||
/// Failover does NOT tear down the primary's subscription;
|
||||
/// <see cref="WnWrapAlarmConsumer"/> is single-subscribe and would
|
||||
/// throw <see cref="InvalidOperationException"/> on a second call.
|
||||
/// The probe therefore re-polls the still-subscribed primary:
|
||||
/// when the underlying COM provider recovers, <c>PollOnce</c> stops
|
||||
/// throwing and clean probes accumulate toward failback. This covers
|
||||
/// the dominant failure mode (transient COM/provider fault after a
|
||||
/// successful initial subscribe).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <strong>Known v1 limitation.</strong> If the <em>original</em>
|
||||
/// <c>Subscribe</c> itself failed (i.e., the primary never reached a
|
||||
/// subscribed state — only reachable when
|
||||
/// <see cref="FailoverSettings.Threshold"/> is 1), polling alone
|
||||
/// cannot re-establish the subscription. That edge case is accepted
|
||||
/// for v1: the operator must restart the session to force a fresh
|
||||
/// subscribe attempt.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public void ProbeOnce()
|
||||
{
|
||||
@@ -171,7 +204,10 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
|
||||
|
||||
try
|
||||
{
|
||||
primary.Subscribe(string.Empty);
|
||||
// Re-poll the still-subscribed primary. Do NOT call Subscribe —
|
||||
// WnWrapAlarmConsumer is single-subscribe and the primary remains
|
||||
// subscribed across the failover; calling Subscribe again would
|
||||
// always throw InvalidOperationException and prevent failback.
|
||||
primary.PollOnce();
|
||||
}
|
||||
catch (Exception)
|
||||
|
||||
Reference in New Issue
Block a user