fix(worker): resilient failover switch; FIPS-safe synthetic GUID; dup-reference guard + tests (Worker-026..028, Worker.Tests-031..033)
This commit is contained in:
@@ -27,6 +27,15 @@ public sealed class FailoverAlarmConsumerTests
|
||||
public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;
|
||||
|
||||
/// <summary>
/// When set, <see cref="PollOnce"/> throws a
/// <see cref="System.Runtime.InteropServices.COMException"/> ("boom",
/// HRESULT 0x80004005) unless the out-of-memory simulation fires first.
/// Defaults to <c>true</c>.
/// </summary>
public bool ThrowOnPoll = true;
|
||||
|
||||
/// <summary>
|
||||
/// When set, <see cref="PollOnce"/> throws
|
||||
/// <see cref="OutOfMemoryException"/> instead of a
|
||||
/// <see cref="System.Runtime.InteropServices.COMException"/>, to
|
||||
/// exercise the OOM-safe exception filter (Worker.Tests-032).
|
||||
/// </summary>
|
||||
public bool ThrowOutOfMemoryOnPoll;
|
||||
|
||||
/// <summary>
/// Number of <see cref="PollOnce"/> calls; incremented before any simulated
/// failure is thrown, so failed polls are counted too.
/// </summary>
public int Polls;
|
||||
|
||||
/// <summary>
|
||||
@@ -48,6 +57,11 @@ public sealed class FailoverAlarmConsumerTests
|
||||
public void PollOnce()
|
||||
{
|
||||
Polls++;
|
||||
if (ThrowOutOfMemoryOnPoll)
|
||||
{
|
||||
throw new OutOfMemoryException("simulated allocation failure");
|
||||
}
|
||||
|
||||
if (ThrowOnPoll)
|
||||
{
|
||||
throw new System.Runtime.InteropServices.COMException("boom", unchecked((int)0x80004005));
|
||||
@@ -75,6 +89,15 @@ public sealed class FailoverAlarmConsumerTests
|
||||
|
||||
/// <summary>Set to <c>true</c> once <see cref="Subscribe"/> has been called.</summary>
public bool Subscribed;
|
||||
|
||||
/// <summary>
|
||||
/// When set, <see cref="SnapshotActiveAlarms"/> throws — modeling a
|
||||
/// priming-snapshot failure during failover (Worker-026).
|
||||
/// </summary>
|
||||
public bool ThrowOnSnapshot;
|
||||
|
||||
/// <summary>Number of <see cref="SnapshotActiveAlarms"/> calls.</summary>
|
||||
public int SnapshotCalls;
|
||||
|
||||
/// <summary>
/// Records that a subscription was requested by setting
/// <see cref="Subscribed"/>; the expression <paramref name="s"/> is ignored.
/// </summary>
public void Subscribe(string s)
{
    Subscribed = true;
}
|
||||
|
||||
/// <summary>No-op poll: this stub never fails and has nothing to drain.</summary>
public void PollOnce()
{
    // Intentionally empty.
}
|
||||
@@ -83,7 +106,16 @@ public sealed class FailoverAlarmConsumerTests
|
||||
|
||||
/// <summary>
/// Stubbed acknowledge: every argument is ignored and the fixed code
/// <c>22</c> is returned.
/// </summary>
public int AcknowledgeByName(string n, string p, string gr, string c, string a, string b, string d, string e)
{
    return 22;
}
|
||||
|
||||
/// <summary>
/// Counts the call in <see cref="SnapshotCalls"/> and then either simulates
/// a priming-snapshot failure or reports that no alarms are active.
/// </summary>
/// <returns>An empty snapshot list when <see cref="ThrowOnSnapshot"/> is not set.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when <see cref="ThrowOnSnapshot"/> is set.
/// </exception>
public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms()
{
    // Count first so a test can prove priming was attempted even when it throws.
    SnapshotCalls++;

    if (ThrowOnSnapshot)
    {
        throw new InvalidOperationException("priming snapshot failed");
    }

    return Array.Empty<MxAlarmSnapshotRecord>();
}
|
||||
|
||||
/// <summary>No-op dispose: the stub owns no resources.</summary>
public void Dispose()
{
    // Intentionally empty.
}
|
||||
|
||||
@@ -291,4 +323,151 @@ public sealed class FailoverAlarmConsumerTests
|
||||
sut.ProbeOnce(); // clean 3 → failback
|
||||
Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
|
||||
}
|
||||
|
||||
/// <summary>
/// Worker-026 regression: a standby whose priming
/// <c>SnapshotActiveAlarms</c> call throws must not derail the failover.
/// The switch is still expected to raise <c>ProviderModeChanged</c> (so the
/// gateway learns the feed is degraded), to leave
/// <see cref="FailoverAlarmConsumer.Mode"/> at Subtag, and to contain the
/// exception instead of letting it escape <c>Subscribe</c> or a later
/// <c>PollOnce</c> — on the real STA an escaping exception would reach the
/// poll loop's trailing catch and permanently stop alarm delivery.
/// </summary>
[Fact]
public void Failover_WhenStandbyPrimingSnapshotThrows_StillRaisesModeChangeAndDoesNotRethrow()
{
    var primary = new FlakyPrimary { ThrowOnPoll = true };
    var standby = new StubStandby { ThrowOnSnapshot = true };
    var settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
    using var sut = new FailoverAlarmConsumer(primary, standby, settings);

    var observedChanges = new List<AlarmProviderModeChange>();
    sut.ProviderModeChanged += (_, change) => observedChanges.Add(change);

    // With threshold=1 the Subscribe failure alone triggers the switch; the
    // switch then primes the standby snapshot, which throws. Nothing may leak.
    Exception? subscribeFailure = Record.Exception(() => sut.Subscribe(@"\\HOST\Galaxy!Area"));

    Assert.Null(subscribeFailure);
    AlarmProviderModeChange onlyChange = Assert.Single(observedChanges);
    Assert.Equal(AlarmProviderMode.Subtag, onlyChange.Mode);
    Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);
    Assert.True(standby.SnapshotCalls >= 1); // priming was attempted

    // A follow-up degraded poll (standby.PollOnce + ProbeOnce) must not
    // resurface the snapshot failure either.
    Exception? pollFailure = Record.Exception(() => sut.PollOnce());

    Assert.Null(pollFailure);
}
|
||||
|
||||
/// <summary>
/// Worker-026 regression: a throwing <c>ProviderModeChanged</c> subscriber
/// (standing in for the AlarmCommandHandler's event-queue enqueue
/// overflowing at capacity) must neither undo the switch nor let its
/// exception escape the switch path into the poll loop.
/// </summary>
[Fact]
public void Failover_WhenModeChangedHandlerThrows_SwitchStillTakesEffectAndDoesNotRethrow()
{
    var primary = new FlakyPrimary { ThrowOnPoll = true };
    var standby = new StubStandby();
    var settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
    using var sut = new FailoverAlarmConsumer(primary, standby, settings);

    int timesHandlerRan = 0;
    sut.ProviderModeChanged += (_, _) =>
    {
        timesHandlerRan++;
        throw new InvalidOperationException("subscriber handler blew up");
    };

    Exception? subscribeFailure = Record.Exception(() => sut.Subscribe(@"\\HOST\Galaxy!Area"));

    Assert.Null(subscribeFailure);
    Assert.Equal(1, timesHandlerRan); // the event still fired
    Assert.Equal(AlarmProviderMode.Subtag, sut.Mode); // the switch still took effect
}
|
||||
|
||||
/// <summary>
/// Worker.Tests-031 regression: when
/// <see cref="FailoverSettings.ProbeIntervalSeconds"/> is non-zero, a second
/// <c>ProbeOnce</c> issued immediately after the first must be throttled
/// and must NOT poll the primary again. Two back-to-back calls always land
/// inside an interval of a second or more, so no injected clock is needed.
/// </summary>
[Fact]
public void ProbeOnce_WithNonZeroInterval_ThrottlesSecondProbeWithinInterval()
{
    var primary = new FlakyPrimary { ThrowOnPoll = true };
    var standby = new StubStandby();

    // stableProbes is set high so one clean probe cannot fail back: Mode
    // stays Subtag and ProbeOnce keeps taking the throttled path.
    var settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 3600, stableProbes: 5);
    using var sut = new FailoverAlarmConsumer(primary, standby, settings);

    // threshold=1 → the failing Subscribe switches to Subtag.
    sut.Subscribe(@"\\HOST\Galaxy!Area");
    Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);

    // Heal the primary so that any probe reaching it would poll cleanly.
    primary.ThrowOnPoll = false;

    // First probe runs and re-polls the primary exactly once.
    sut.ProbeOnce();
    int pollsAfterFirstProbe = primary.Polls;
    Assert.Equal(1, pollsAfterFirstProbe);

    // Second probe falls inside the 3600s interval → throttled, no re-poll.
    sut.ProbeOnce();

    Assert.Equal(pollsAfterFirstProbe, primary.Polls);
    Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);
}
|
||||
|
||||
/// <summary>
/// Worker.Tests-032 regression: the
/// <c>when (ex is not OutOfMemoryException)</c> filter in <c>RunPrimary</c>
/// must let an <see cref="OutOfMemoryException"/> propagate instead of
/// swallowing it and counting it toward the failover threshold. No mode
/// change may fire — a fatal allocation failure is not a clean degraded
/// handoff.
/// </summary>
[Fact]
public void RunPrimary_WhenPrimaryThrowsOutOfMemory_PropagatesAndDoesNotFailOver()
{
    var primary = new FlakyPrimary { ThrowOnPoll = false, ThrowOutOfMemoryOnPoll = true };
    var standby = new StubStandby();
    var settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
    using var sut = new FailoverAlarmConsumer(primary, standby, settings);

    bool sawModeChange = false;
    sut.ProviderModeChanged += (_, _) => sawModeChange = true;

    // Subscribe path does not poll; no throw here.
    sut.Subscribe(@"\\HOST\Galaxy!Area");

    Assert.Throws<OutOfMemoryException>(() => sut.PollOnce());
    Assert.False(sawModeChange);
    Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
}
|
||||
|
||||
/// <summary>
/// Worker.Tests-032 regression: <see cref="FailoverSettings"/> must clamp a
/// <c>threshold</c> or <c>stableProbes</c> below 1, and a
/// <c>probeIntervalSeconds</c> below 0, to the safe minimum so a
/// misconfigured bind cannot change failover semantics. In-range values
/// pass through untouched.
/// </summary>
[Theory]
[InlineData(0, 0, 0, 1, 0, 1)]
[InlineData(-5, -5, -5, 1, 0, 1)]
[InlineData(3, 7, 2, 3, 7, 2)]
public void FailoverSettings_ClampsSubMinimumValues(
    int threshold,
    int probeInterval,
    int stableProbes,
    int expectedThreshold,
    int expectedProbeInterval,
    int expectedStableProbes)
{
    var clamped = new FailoverSettings(threshold, probeInterval, stableProbes);

    Assert.Equal(expectedThreshold, clamped.Threshold);
    Assert.Equal(expectedProbeInterval, clamped.ProbeIntervalSeconds);
    Assert.Equal(expectedStableProbes, clamped.StableProbes);
}
|
||||
}
|
||||
|
||||
@@ -109,6 +109,31 @@ public sealed class SubtagAlarmStateMachineTests
|
||||
Assert.Throws<ArgumentException>(() => new SubtagAlarmStateMachine(new[] { first, second }));
|
||||
}
|
||||
|
||||
/// <summary>
/// Worker-028 regression: two watch-list entries that share an
/// <see cref="AlarmSubtagTarget.AlarmFullReference"/> while binding different
/// subtag addresses must be rejected at construction — mirroring the
/// duplicate-address guard — instead of silently overwriting the earlier
/// reference's state and orphaning its bound addresses.
/// </summary>
[Fact]
public void DuplicateAlarmFullReference_Throws()
{
    const string sharedReference = "Galaxy!Area.Tank01.Level.HiHi";

    AlarmSubtagTarget first = new AlarmSubtagTarget
    {
        AlarmFullReference = sharedReference,
        SourceObjectReference = "Tank01",
        ActiveSubtag = "Tank01.Level.HiHi.active",
    };
    AlarmSubtagTarget second = new AlarmSubtagTarget
    {
        AlarmFullReference = sharedReference,
        SourceObjectReference = "Tank01",
        ActiveSubtag = "Other.active",
    };
    AlarmSubtagTarget[] watchList = { first, second };

    Assert.Throws<ArgumentException>(() => new SubtagAlarmStateMachine(watchList));
}
|
||||
|
||||
[Fact]
|
||||
public void AckedTrueWhileActive_EmitsAck()
|
||||
{
|
||||
@@ -162,4 +187,64 @@ public sealed class SubtagAlarmStateMachineTests
|
||||
var events = sm.Apply("Some.Other.Tag.active", true, DateTime.UtcNow);
|
||||
Assert.Empty(events);
|
||||
}
|
||||
|
||||
/// <summary>
/// Worker.Tests-033 regression: an ack that arrives while the alarm is NOT
/// active must produce no event and must NOT latch
/// <c>AckedDuringEpisode</c>. If it did, a stale ack from an earlier episode
/// would turn the next raise/clear into a spurious ACK_RTN; the following
/// raise/clear must therefore still end in UNACK_RTN.
/// </summary>
[Fact]
public void AckedTrueWhileInactive_EmitsNothingAndDoesNotLatch()
{
    var sm = new SubtagAlarmStateMachine(new[] { Target() });
    var ackTime = new DateTime(2026, 6, 13, 9, 0, 0, DateTimeKind.Utc);
    var raiseTime = ackTime.AddSeconds(5);
    var clearTime = ackTime.AddSeconds(10);

    // An ack with no preceding raise has to be a no-op.
    var ackEvents = sm.Apply("Tank01.Level.HiHi.acked", true, ackTime);
    Assert.Empty(ackEvents);

    // Fresh episode: raise, then clear. The earlier ack must not have latched
    // AckedDuringEpisode, so the clear is expected to be UNACK_RTN.
    sm.Apply("Tank01.Level.HiHi.active", true, raiseTime);
    var clearEvents = sm.Apply("Tank01.Level.HiHi.active", false, clearTime);

    var clear = Assert.Single(clearEvents);
    Assert.Equal(MxAlarmStateKind.UnackRtn, clear.Record.State);
}
|
||||
|
||||
/// <summary>
/// Worker.Tests-033 regression: a value change on the priority subtag must
/// travel through <c>CoerceInt</c> into
/// <see cref="MxAlarmSnapshotRecord.Priority"/> on the emitted record, and a
/// non-numeric value must leave the previously recorded priority untouched
/// (the CoerceInt fallback path).
/// </summary>
[Fact]
public void PriorityChange_FlowsIntoEmittedRecord()
{
    const int recordedPriority = 750;
    var target = new AlarmSubtagTarget
    {
        AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi",
        SourceObjectReference = "Tank01",
        ActiveSubtag = "Tank01.Level.HiHi.active",
        AckedSubtag = "Tank01.Level.HiHi.acked",
        PrioritySubtag = "Tank01.Level.HiHi.priority",
    };
    var sm = new SubtagAlarmStateMachine(new[] { target });
    var start = new DateTime(2026, 6, 13, 9, 0, 0, DateTimeKind.Utc);

    // On its own a priority change emits nothing; it is only recorded.
    var priorityEvents = sm.Apply("Tank01.Level.HiHi.priority", recordedPriority, start);
    Assert.Empty(priorityEvents);

    // The raise carries the priority recorded above.
    var raiseEvents = sm.Apply("Tank01.Level.HiHi.active", true, start.AddSeconds(1));
    var raise = Assert.Single(raiseEvents);
    Assert.Equal(recordedPriority, raise.Record.Priority);

    // A non-numeric priority must fall back to the existing value, not zero.
    sm.Apply("Tank01.Level.HiHi.priority", "not-a-number", start.AddSeconds(2));
    var snap = Assert.Single(sm.SnapshotActive());
    Assert.Equal(recordedPriority, snap.Priority);
}
|
||||
}
|
||||
|
||||
@@ -24,4 +24,43 @@ public sealed class SyntheticAlarmGuidTests
|
||||
/// <summary>
/// Verifies that an ordinary dotted reference derives a GUID other than
/// <see cref="Guid.Empty"/>.
/// </summary>
[Fact]
public void Reference_ProducesNonEmptyGuid()
{
    Guid derived = SyntheticAlarmGuid.ForReference("A.B.C");

    Assert.NotEqual(Guid.Empty, derived);
}
|
||||
|
||||
/// <summary>
/// Verifies that even the empty string derives a GUID other than
/// <see cref="Guid.Empty"/>. The length fold in the derivation is what
/// prevents a degenerate all-zero result, which would collide with the
/// unset-record default downstream.
/// </summary>
[Fact]
public void EmptyReference_ProducesNonEmptyGuid()
{
    Guid derived = SyntheticAlarmGuid.ForReference(string.Empty);

    Assert.NotEqual(Guid.Empty, derived);
}
|
||||
|
||||
/// <summary>
/// Worker-027 regression: <see cref="SyntheticAlarmGuid.ForReference"/>
/// must derive its GUID without routing through
/// <see cref="System.Security.Cryptography"/>, because on net48
/// <c>MD5.Create()</c> throws under the Windows FIPS-compliance policy.
/// The test turns on the
/// <c>Switch.System.Security.Cryptography.UseLegacyFipsThrow</c> AppContext
/// switch for the duration of the call and asserts that the derivation still
/// succeeds and is deterministic.
/// </summary>
/// <remarks>
/// NOTE(review): according to the .NET Framework 4.8 runtime-change notes,
/// this switch only opts managed crypto classes back into the legacy
/// "throw in FIPS mode" behaviour; it does not by itself put the process
/// into FIPS mode. On a host where the OS FIPS policy is off, this test
/// therefore likely cannot detect a reintroduced crypto provider — confirm
/// whether CI runs it on a FIPS-enabled machine. The switch is also
/// process-wide, so it is restored in <c>finally</c>; tests running in
/// parallel may still observe it while it is set.
/// </remarks>
[Fact]
public void ForReference_UnderFipsEnforcement_DoesNotThrowAndStaysDeterministic()
{
    const string switchName = "Switch.System.Security.Cryptography.UseLegacyFipsThrow";
    const string reference = "Galaxy!Area.Tank01.Level.HiHi";

    // An unset switch is treated as false, which is also what gets restored.
    bool original = AppContext.TryGetSwitch(switchName, out bool value) && value;
    AppContext.SetSwitch(switchName, true);
    try
    {
        Guid first = SyntheticAlarmGuid.ForReference(reference);
        Guid second = SyntheticAlarmGuid.ForReference(reference);

        Assert.NotEqual(Guid.Empty, first);
        Assert.Equal(first, second);
    }
    finally
    {
        // Always put the process-wide switch back, even if an assertion failed.
        AppContext.SetSwitch(switchName, original);
    }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user