worker(alarms): failback probe re-polls the still-subscribed primary (no re-Subscribe)

This commit is contained in:
Joseph Doherty
2026-06-13 09:49:38 -04:00
parent 0a54c0bc4b
commit d6c0bb41ca
2 changed files with 104 additions and 13 deletions
@@ -64,6 +64,13 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
private int cleanProbes;
private bool disposed;
/// <summary>
/// The subscription expression most recently passed to <see cref="Subscribe"/>.
/// Retained for reference and for a potential future full re-subscribe
/// path; the primary is NOT re-subscribed during probing.
/// </summary>
private string subscriptionExpression = string.Empty;
/// <summary>
/// Composes the failover consumer over its two children.
/// </summary>
@@ -111,6 +118,10 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
{
if (disposed) throw new ObjectDisposedException(nameof(FailoverAlarmConsumer));
// Remember the expression for reference only: the primary is not torn
// down on failover, so ProbeOnce never needs it to re-subscribe.
subscriptionExpression = subscription;
// Arm the standby first so it is warm regardless of primary outcome.
// A standby subscribe failure is a hard fault (the fallback itself is
// broken) and is surfaced to the caller; it does not feed the primary
@@ -155,14 +166,36 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
/// Only meaningful while the standby is active; a no-op otherwise.
/// </summary>
/// <remarks>
/// A clean probe (primary <c>Subscribe</c> + <c>PollOnce</c> both
/// succeed) increments the clean-probe counter and, once it reaches
/// <see cref="FailoverSettings.StableProbes"/>, fails back to the
/// primary. Any probe failure resets the clean-probe counter to 0 so
/// the consumer requires a fresh unbroken run before failing back.
/// Exposed publicly so tests (and any external scheduler honoring
/// <see cref="FailoverSettings.ProbeIntervalSeconds"/> cadence) can
/// drive it directly.
/// <para>
/// A clean probe (primary <c>PollOnce</c> succeeds without
/// throwing) increments the clean-probe counter and, once it reaches
/// <see cref="FailoverSettings.StableProbes"/>, fails back to the
/// primary. Any probe failure resets the clean-probe counter to 0 so
/// the consumer requires a fresh unbroken run before failing back.
/// Exposed publicly so tests (and any external scheduler honoring
/// <see cref="FailoverSettings.ProbeIntervalSeconds"/> cadence) can
/// drive it directly.
/// </para>
/// <para>
/// <strong>Why PollOnce only — no re-Subscribe.</strong>
/// Failover does NOT tear down the primary's subscription;
/// <see cref="WnWrapAlarmConsumer"/> is single-subscribe and would
/// throw <see cref="InvalidOperationException"/> on a second call.
/// The probe therefore re-polls the still-subscribed primary:
/// when the underlying COM provider recovers, <c>PollOnce</c> stops
/// throwing and clean probes accumulate toward failback. This covers
/// the dominant failure mode (transient COM/provider fault after a
/// successful initial subscribe).
/// </para>
/// <para>
/// <strong>Known v1 limitation.</strong> If the <em>original</em>
/// <c>Subscribe</c> itself failed (i.e., the primary never reached a
/// subscribed state — only reachable when
/// <see cref="FailoverSettings.Threshold"/> is 1), polling alone
/// cannot re-establish the subscription. That edge case is accepted
/// for v1: the operator must restart the session to force a fresh
/// subscribe attempt.
/// </para>
/// </remarks>
public void ProbeOnce()
{
@@ -171,7 +204,10 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
try
{
primary.Subscribe(string.Empty);
// Re-poll the still-subscribed primary. Do NOT call Subscribe —
// WnWrapAlarmConsumer is single-subscribe and the primary remains
// subscribed across the failover; calling Subscribe again would
// always throw InvalidOperationException and prevent failback.
primary.PollOnce();
}
catch (Exception)