fix(worker): resilient failover switch; FIPS-safe synthetic GUID; dup-reference guard + tests (Worker-026..028, Worker.Tests-031..033)

This commit is contained in:
Joseph Doherty
2026-06-15 02:56:15 -04:00
parent ddf2d84fbc
commit cebe67e9bd
8 changed files with 584 additions and 20 deletions
@@ -27,6 +27,15 @@ public sealed class FailoverAlarmConsumerTests
public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;
/// <summary>
/// When set (the default), <see cref="PollOnce"/> throws a
/// <see cref="System.Runtime.InteropServices.COMException"/> after
/// counting the poll.
/// </summary>
public bool ThrowOnPoll = true;
/// <summary>
/// When set, <see cref="PollOnce"/> throws
/// <see cref="OutOfMemoryException"/> instead of a
/// <see cref="System.Runtime.InteropServices.COMException"/>, to
/// exercise the OOM-safe exception filter (Worker.Tests-032). This flag is
/// checked before <see cref="ThrowOnPoll"/>, so it wins when both are set.
/// </summary>
public bool ThrowOutOfMemoryOnPoll;
/// <summary>
/// Number of <see cref="PollOnce"/> calls; incremented before any
/// simulated failure is thrown, so failed polls are counted too.
/// </summary>
public int Polls;
/// <summary>
@@ -48,6 +57,11 @@ public sealed class FailoverAlarmConsumerTests
public void PollOnce()
{
Polls++;
if (ThrowOutOfMemoryOnPoll)
{
throw new OutOfMemoryException("simulated allocation failure");
}
if (ThrowOnPoll)
{
throw new System.Runtime.InteropServices.COMException("boom", unchecked((int)0x80004005));
@@ -75,6 +89,15 @@ public sealed class FailoverAlarmConsumerTests
public bool Subscribed; // set to true by Subscribe; never reset in the visible code
/// <summary>
/// When set, <see cref="SnapshotActiveAlarms"/> throws — modeling a
/// priming-snapshot failure during failover (Worker-026).
/// </summary>
public bool ThrowOnSnapshot;
/// <summary>Number of <see cref="SnapshotActiveAlarms"/> calls.</summary>
public int SnapshotCalls;
/// <summary>Records that a subscription was requested; the argument itself is ignored.</summary>
public void Subscribe(string s) => Subscribed = true;
/// <summary>No-op poll: the stub does nothing and never throws here.</summary>
public void PollOnce() { }
@@ -83,7 +106,16 @@ public sealed class FailoverAlarmConsumerTests
// Stub acknowledge: ignores all eight arguments and always returns the fixed value 22.
public int AcknowledgeByName(string n, string p, string gr, string c, string a, string b, string d, string e) => 22;
public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms() => Array.Empty<MxAlarmSnapshotRecord>();
/// <summary>
/// Counts the call, then either fails with an
/// <see cref="InvalidOperationException"/> (when <c>ThrowOnSnapshot</c> is
/// set) or reports that no alarms are active.
/// </summary>
/// <returns>An empty snapshot list when no failure is being simulated.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when <c>ThrowOnSnapshot</c> is set.
/// </exception>
public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms()
{
    // Count first so that a throwing attempt is still visible to the test.
    SnapshotCalls += 1;

    if (!ThrowOnSnapshot)
    {
        return Array.Empty<MxAlarmSnapshotRecord>();
    }

    throw new InvalidOperationException("priming snapshot failed");
}
/// <summary>No-op; the body is intentionally empty.</summary>
public void Dispose() { }
@@ -291,4 +323,151 @@ public sealed class FailoverAlarmConsumerTests
sut.ProbeOnce(); // clean 3 → failback
Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
}
/// <summary>
/// Worker-026 regression. The standby's <c>SnapshotActiveAlarms</c> is made
/// to throw while the consumer fails over. The switch must nevertheless
/// raise <c>ProviderModeChanged</c> exactly once, leave
/// <see cref="FailoverAlarmConsumer.Mode"/> at Subtag, and keep the
/// exception from escaping either <c>Subscribe</c> or a later
/// <c>PollOnce</c> — an escaped exception would end up in the poll loop's
/// trailing catch and stop alarm delivery for good.
/// </summary>
[Fact]
public void Failover_WhenStandbyPrimingSnapshotThrows_StillRaisesModeChangeAndDoesNotRethrow()
{
    var settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
    var primary = new FlakyPrimary { ThrowOnPoll = true };
    var standby = new StubStandby { ThrowOnSnapshot = true };
    using var sut = new FailoverAlarmConsumer(primary, standby, settings);

    var observedChanges = new List<AlarmProviderModeChange>();
    sut.ProviderModeChanged += (_, change) => observedChanges.Add(change);

    // With threshold=1 the very first primary failure (during Subscribe)
    // triggers the switch, and the switch primes the standby snapshot,
    // which throws. None of that may surface to the caller.
    Exception? subscribeFailure = Record.Exception(() => sut.Subscribe(@"\\HOST\Galaxy!Area"));

    Assert.Null(subscribeFailure);
    var onlyChange = Assert.Single(observedChanges);
    Assert.Equal(AlarmProviderMode.Subtag, onlyChange.Mode);
    Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);
    Assert.InRange(standby.SnapshotCalls, 1, int.MaxValue); // priming was attempted

    // Polling while degraded (standby.PollOnce + ProbeOnce) must be just
    // as quiet about the snapshot failure.
    Exception? pollFailure = Record.Exception(() => sut.PollOnce());

    Assert.Null(pollFailure);
}
/// <summary>
/// Worker-026 regression. A <c>ProviderModeChanged</c> subscriber that
/// throws (standing in for the AlarmCommandHandler's event queue
/// overflowing on enqueue) must not undo the switch, and its exception
/// must not leak out of the switch path into the poll loop.
/// </summary>
[Fact]
public void Failover_WhenModeChangedHandlerThrows_SwitchStillTakesEffectAndDoesNotRethrow()
{
    var settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
    var standby = new StubStandby();
    var primary = new FlakyPrimary { ThrowOnPoll = true };
    using var sut = new FailoverAlarmConsumer(primary, standby, settings);

    int timesRaised = 0;
    sut.ProviderModeChanged += (_, _) =>
    {
        timesRaised += 1;
        throw new InvalidOperationException("subscriber handler blew up");
    };

    Exception? subscribeFailure = Record.Exception(() => sut.Subscribe(@"\\HOST\Galaxy!Area"));

    Assert.Null(subscribeFailure);
    Assert.Equal(1, timesRaised); // the event was still raised
    Assert.Equal(AlarmProviderMode.Subtag, sut.Mode); // and the switch stuck
}
/// <summary>
/// Worker.Tests-031 regression. When
/// <see cref="FailoverSettings.ProbeIntervalSeconds"/> is non-zero, a
/// second <c>ProbeOnce</c> issued straight after the first lands inside
/// the interval and must be throttled, i.e. it must not poll the primary
/// again. Back-to-back calls are always closer together than the one-hour
/// interval used here, so no injected clock is required.
/// </summary>
[Fact]
public void ProbeOnce_WithNonZeroInterval_ThrottlesSecondProbeWithinInterval()
{
    // stableProbes is set above 1 so that one clean probe cannot fail
    // back; Mode therefore stays Subtag and ProbeOnce stays on the
    // throttled path for the whole test.
    var settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 3600, stableProbes: 5);
    var primary = new FlakyPrimary { ThrowOnPoll = true };
    var standby = new StubStandby();
    using var sut = new FailoverAlarmConsumer(primary, standby, settings);

    sut.Subscribe(@"\\HOST\Galaxy!Area"); // threshold=1 → switch to Subtag
    Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);

    // Heal the primary so any probe that does run polls it cleanly.
    primary.ThrowOnPoll = false;

    sut.ProbeOnce(); // first probe is allowed through and polls the primary
    Assert.Equal(1, primary.Polls);
    int pollsBeforeSecondProbe = primary.Polls;

    sut.ProbeOnce(); // still inside the 3600s interval → must be a no-op

    Assert.Equal(pollsBeforeSecondProbe, primary.Polls);
    Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);
}
/// <summary>
/// Worker.Tests-032 regression. The
/// <c>when (ex is not OutOfMemoryException)</c> filter in
/// <c>RunPrimary</c> must let an <see cref="OutOfMemoryException"/>
/// through instead of swallowing it and counting it against the failover
/// threshold. A fatal allocation failure is not a clean degraded handoff,
/// so no mode change may be raised and the mode must stay Alarmmgr.
/// </summary>
[Fact]
public void RunPrimary_WhenPrimaryThrowsOutOfMemory_PropagatesAndDoesNotFailOver()
{
    var settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
    var standby = new StubStandby();
    var primary = new FlakyPrimary { ThrowOutOfMemoryOnPoll = true, ThrowOnPoll = false };
    using var sut = new FailoverAlarmConsumer(primary, standby, settings);

    int modeChangesSeen = 0;
    sut.ProviderModeChanged += (_, _) => modeChangesSeen += 1;

    // Subscribing does not poll, so nothing is thrown yet.
    sut.Subscribe(@"\\HOST\Galaxy!Area");

    Assert.Throws<OutOfMemoryException>(() => sut.PollOnce());
    Assert.Equal(0, modeChangesSeen);
    Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
}
/// <summary>
/// Worker.Tests-032 regression. <see cref="FailoverSettings"/> must raise
/// a <c>threshold</c> or <c>stableProbes</c> below 1 to 1, and a negative
/// <c>probeIntervalSeconds</c> to 0, so that a misconfigured bind cannot
/// alter failover semantics. In-range values pass through untouched.
/// </summary>
[Theory]
[InlineData(0, 0, 0, 1, 0, 1)]
[InlineData(-5, -5, -5, 1, 0, 1)]
[InlineData(3, 7, 2, 3, 7, 2)]
public void FailoverSettings_ClampsSubMinimumValues(
    int threshold,
    int probeInterval,
    int stableProbes,
    int expectedThreshold,
    int expectedProbeInterval,
    int expectedStableProbes)
{
    var clamped = new FailoverSettings(threshold, probeInterval, stableProbes);

    Assert.Equal(expectedThreshold, clamped.Threshold);
    Assert.Equal(expectedProbeInterval, clamped.ProbeIntervalSeconds);
    Assert.Equal(expectedStableProbes, clamped.StableProbes);
}
}
@@ -109,6 +109,31 @@ public sealed class SubtagAlarmStateMachineTests
Assert.Throws<ArgumentException>(() => new SubtagAlarmStateMachine(new[] { first, second }));
}
/// <summary>
/// Worker-028 regression. Two watch-list entries that carry the same
/// <see cref="AlarmSubtagTarget.AlarmFullReference"/> but different subtag
/// addresses must be rejected at construction — mirroring the
/// duplicate-address guard — instead of the later entry silently
/// overwriting the earlier one's state and orphaning its bound addresses.
/// </summary>
[Fact]
public void DuplicateAlarmFullReference_Throws()
{
    const string sharedReference = "Galaxy!Area.Tank01.Level.HiHi";

    var watchList = new[]
    {
        new AlarmSubtagTarget
        {
            AlarmFullReference = sharedReference,
            SourceObjectReference = "Tank01",
            ActiveSubtag = "Tank01.Level.HiHi.active",
        },
        new AlarmSubtagTarget
        {
            AlarmFullReference = sharedReference,
            SourceObjectReference = "Tank01",
            ActiveSubtag = "Other.active",
        },
    };

    Assert.Throws<ArgumentException>(() => new SubtagAlarmStateMachine(watchList));
}
[Fact]
public void AckedTrueWhileActive_EmitsAck()
{
@@ -162,4 +187,64 @@ public sealed class SubtagAlarmStateMachineTests
var events = sm.Apply("Some.Other.Tag.active", true, DateTime.UtcNow);
Assert.Empty(events);
}
/// <summary>
/// Worker.Tests-033 regression. An ack that arrives while the alarm is not
/// active must produce no events and must not latch
/// <c>AckedDuringEpisode</c>. If it did latch, a stale ack from an earlier
/// episode would turn the next episode's clear into a spurious ACK_RTN;
/// the raise/clear that follows here must therefore end in UNACK_RTN.
/// </summary>
[Fact]
public void AckedTrueWhileInactive_EmitsNothingAndDoesNotLatch()
{
    DateTime start = new DateTime(2026, 6, 13, 9, 0, 0, DateTimeKind.Utc);
    var machine = new SubtagAlarmStateMachine(new[] { Target() });

    // No raise has happened yet, so this ack has to be ignored.
    Assert.Empty(machine.Apply("Tank01.Level.HiHi.acked", true, start));

    // New episode: raise, then clear. The ignored ack must not colour the
    // clear, which is expected to be reported as unacknowledged.
    machine.Apply("Tank01.Level.HiHi.active", true, start.AddSeconds(5));
    var onClear = Assert.Single(machine.Apply("Tank01.Level.HiHi.active", false, start.AddSeconds(10)));

    Assert.Equal(MxAlarmStateKind.UnackRtn, onClear.Record.State);
}
/// <summary>
/// Worker.Tests-033 regression. A value written to the priority subtag has
/// to travel through <c>CoerceInt</c> and show up as
/// <see cref="MxAlarmSnapshotRecord.Priority"/> on the next emitted record.
/// A value that is not numeric must leave the previously recorded priority
/// in place (the <c>CoerceInt</c> fallback path) rather than resetting it.
/// </summary>
[Fact]
public void PriorityChange_FlowsIntoEmittedRecord()
{
    const string activeSubtag = "Tank01.Level.HiHi.active";
    const string prioritySubtag = "Tank01.Level.HiHi.priority";

    DateTime start = new DateTime(2026, 6, 13, 9, 0, 0, DateTimeKind.Utc);
    var machine = new SubtagAlarmStateMachine(new[]
    {
        new AlarmSubtagTarget
        {
            AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi",
            SourceObjectReference = "Tank01",
            ActiveSubtag = activeSubtag,
            AckedSubtag = "Tank01.Level.HiHi.acked",
            PrioritySubtag = prioritySubtag,
        },
    });

    // On its own a priority write is silent; it is only remembered.
    Assert.Empty(machine.Apply(prioritySubtag, 750, start));

    // The raise that follows must report the remembered priority.
    var raised = Assert.Single(machine.Apply(activeSubtag, true, start.AddSeconds(1)));
    Assert.Equal(750, raised.Record.Priority);

    // Garbage on the priority subtag keeps the old value instead of zeroing it.
    machine.Apply(prioritySubtag, "not-a-number", start.AddSeconds(2));
    var stillActive = Assert.Single(machine.SnapshotActive());
    Assert.Equal(750, stillActive.Priority);
}
}
@@ -24,4 +24,43 @@ public sealed class SyntheticAlarmGuidTests
[Fact]
public void Reference_ProducesNonEmptyGuid()
{
    // An ordinary dotted reference must never derive Guid.Empty.
    Guid derived = SyntheticAlarmGuid.ForReference("A.B.C");

    Assert.NotEqual(Guid.Empty, derived);
}
/// <summary>
/// The empty string must also derive a GUID other than
/// <see cref="Guid.Empty"/>. The derivation folds in the input length to
/// avoid a degenerate all-zero result, which downstream would be
/// indistinguishable from the default of an unset record.
/// </summary>
[Fact]
public void EmptyReference_ProducesNonEmptyGuid()
{
    Guid derived = SyntheticAlarmGuid.ForReference(string.Empty);

    Assert.NotEqual(Guid.Empty, derived);
}
/// <summary>
/// Worker-027 regression. <see cref="SyntheticAlarmGuid.ForReference"/>
/// must not depend on <see cref="System.Security.Cryptography"/>, because
/// on net48 <c>MD5.Create()</c> throws when the Windows FIPS-compliance
/// policy is active. The test turns on the
/// <c>UseLegacyFipsThrow</c> AppContext switch for its duration, derives
/// the same reference twice, and expects a stable, non-empty GUID; the
/// switch is put back to its prior value (or off, if it was unset)
/// afterwards.
/// </summary>
[Fact]
public void ForReference_UnderFipsEnforcement_DoesNotThrowAndStaysDeterministic()
{
    // NOTE(review): per the AppContext switch documentation this switch
    // appears to change behaviour only when the OS FIPS policy is already
    // enabled — confirm this test can actually fail on a non-FIPS build
    // agent. It also mutates a process-wide switch, so verify it cannot
    // race with crypto-using tests that run in parallel.
    const string fipsSwitch = "Switch.System.Security.Cryptography.UseLegacyFipsThrow";
    const string reference = "Galaxy!Area.Tank01.Level.HiHi";

    if (!AppContext.TryGetSwitch(fipsSwitch, out bool priorValue))
    {
        // A switch that was never set is restored as "off".
        priorValue = false;
    }

    AppContext.SetSwitch(fipsSwitch, true);
    try
    {
        Guid firstDerivation = SyntheticAlarmGuid.ForReference(reference);
        Guid secondDerivation = SyntheticAlarmGuid.ForReference(reference);

        Assert.NotEqual(Guid.Empty, firstDerivation);
        Assert.Equal(firstDerivation, secondDerivation);
    }
    finally
    {
        AppContext.SetSwitch(fipsSwitch, priorValue);
    }
}
}