fix(worker): resilient failover switch; FIPS-safe synthetic GUID; dup-reference guard + tests (Worker-026..028, Worker.Tests-031..033)

This commit is contained in:
Joseph Doherty
2026-06-15 02:56:15 -04:00
parent ddf2d84fbc
commit cebe67e9bd
8 changed files with 584 additions and 20 deletions
@@ -27,6 +27,15 @@ public sealed class FailoverAlarmConsumerTests
public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;
public bool ThrowOnPoll = true;
/// <summary>
/// When set, <see cref="PollOnce"/> throws
/// <see cref="OutOfMemoryException"/> instead of a
/// <see cref="System.Runtime.InteropServices.COMException"/>, to
/// exercise the OOM-safe exception filter (Worker.Tests-032).
/// </summary>
public bool ThrowOutOfMemoryOnPoll;
public int Polls;
/// <summary>
@@ -48,6 +57,11 @@ public sealed class FailoverAlarmConsumerTests
public void PollOnce()
{
Polls++;
if (ThrowOutOfMemoryOnPoll)
{
throw new OutOfMemoryException("simulated allocation failure");
}
if (ThrowOnPoll)
{
throw new System.Runtime.InteropServices.COMException("boom", unchecked((int)0x80004005));
@@ -75,6 +89,15 @@ public sealed class FailoverAlarmConsumerTests
public bool Subscribed;
/// <summary>
/// When set, <see cref="SnapshotActiveAlarms"/> throws — modeling a
/// priming-snapshot failure during failover (Worker-026).
/// </summary>
public bool ThrowOnSnapshot;
/// <summary>Number of <see cref="SnapshotActiveAlarms"/> calls.</summary>
public int SnapshotCalls;
/// <summary>Marks the stub as subscribed; the subscription argument is ignored.</summary>
public void Subscribe(string s)
{
    Subscribed = true;
}
/// <summary>Polling does nothing on this stub.</summary>
public void PollOnce() { }
@@ -83,7 +106,16 @@ public sealed class FailoverAlarmConsumerTests
/// <summary>Stub acknowledge: every argument is ignored and the fixed value 22 is returned.</summary>
public int AcknowledgeByName(string n, string p, string gr, string c, string a, string b, string d, string e)
{
    return 22;
}
public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms() => Array.Empty<MxAlarmSnapshotRecord>();
/// <summary>
/// Counts the call in <see cref="SnapshotCalls"/>, then either throws an
/// <see cref="InvalidOperationException"/> (when <see cref="ThrowOnSnapshot"/>
/// is set) or reports an empty set of active alarms.
/// </summary>
public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms()
{
    SnapshotCalls += 1;

    // The counter is bumped before the failure so a throwing call is still counted.
    return ThrowOnSnapshot
        ? throw new InvalidOperationException("priming snapshot failed")
        : Array.Empty<MxAlarmSnapshotRecord>();
}
/// <summary>Nothing to release on this stub.</summary>
public void Dispose() { }
@@ -291,4 +323,151 @@ public sealed class FailoverAlarmConsumerTests
sut.ProbeOnce(); // clean 3 → failback
Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
}
/// <summary>
/// Worker-026 regression. The standby's priming <c>SnapshotActiveAlarms</c>
/// is made to throw while a failover is triggered with a threshold of one.
/// The switch must still raise exactly one <c>ProviderModeChanged</c>
/// carrying <c>Subtag</c>, leave <see cref="FailoverAlarmConsumer.Mode"/> at
/// <c>Subtag</c>, and let nothing escape from <c>Subscribe</c> or a later
/// <c>PollOnce</c> — on the real STA an escaping exception would land in the
/// poll loop's trailing catch and permanently stop alarm delivery.
/// </summary>
[Fact]
public void Failover_WhenStandbyPrimingSnapshotThrows_StillRaisesModeChangeAndDoesNotRethrow()
{
    var flakyPrimary = new FlakyPrimary { ThrowOnPoll = true };
    var throwingStandby = new StubStandby { ThrowOnSnapshot = true };
    var failoverSettings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
    using var consumer = new FailoverAlarmConsumer(flakyPrimary, throwingStandby, failoverSettings);
    var observedChanges = new List<AlarmProviderModeChange>();
    consumer.ProviderModeChanged += (_, change) => observedChanges.Add(change);

    // With threshold=1 the Subscribe failure triggers the switch, and the
    // switch attempts the (throwing) priming snapshot. Nothing may escape.
    Exception? fromSubscribe = Record.Exception(() => consumer.Subscribe(@"\\HOST\Galaxy!Area"));

    Assert.Null(fromSubscribe);
    var onlyChange = Assert.Single(observedChanges);
    Assert.Equal(AlarmProviderMode.Subtag, onlyChange.Mode);
    Assert.Equal(AlarmProviderMode.Subtag, consumer.Mode);
    Assert.True(throwingStandby.SnapshotCalls > 0); // priming was attempted

    // A degraded PollOnce afterwards (standby.PollOnce + ProbeOnce) must not
    // surface the snapshot failure either.
    Exception? fromPoll = Record.Exception(() => consumer.PollOnce());

    Assert.Null(fromPoll);
}
/// <summary>
/// Worker-026 regression. A <c>ProviderModeChanged</c> subscriber that throws
/// (standing in for the AlarmCommandHandler's event-queue enqueue overflowing
/// at capacity) must not prevent the switch from taking effect, and its
/// exception must not leave the switch path and reach the poll loop.
/// </summary>
[Fact]
public void Failover_WhenModeChangedHandlerThrows_SwitchStillTakesEffectAndDoesNotRethrow()
{
    var flakyPrimary = new FlakyPrimary { ThrowOnPoll = true };
    var standby = new StubStandby();
    var failoverSettings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
    using var consumer = new FailoverAlarmConsumer(flakyPrimary, standby, failoverSettings);
    int timesHandlerRan = 0;
    consumer.ProviderModeChanged += (_, _) =>
    {
        timesHandlerRan += 1;
        throw new InvalidOperationException("subscriber handler blew up");
    };

    Exception? fromSubscribe = Record.Exception(() => consumer.Subscribe(@"\\HOST\Galaxy!Area"));

    Assert.Null(fromSubscribe);
    Assert.Equal(1, timesHandlerRan); // the notification was still delivered
    Assert.Equal(AlarmProviderMode.Subtag, consumer.Mode); // and the switch stuck
}
/// <summary>
/// Worker.Tests-031 regression. When
/// <see cref="FailoverSettings.ProbeIntervalSeconds"/> is non-zero, a second
/// <c>ProbeOnce</c> issued immediately after the first lands inside the
/// interval and must be throttled, i.e. it must NOT poll the primary again.
/// Back-to-back calls always fall inside an interval of a second or more, so
/// no injected clock is required.
/// </summary>
[Fact]
public void ProbeOnce_WithNonZeroInterval_ThrottlesSecondProbeWithinInterval()
{
    var flakyPrimary = new FlakyPrimary { ThrowOnPoll = true };
    var standby = new StubStandby();

    // stableProbes is large enough that one clean probe cannot fail back, so
    // the consumer stays in Subtag and ProbeOnce stays on the throttled path.
    var failoverSettings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 3600, stableProbes: 5);
    using var consumer = new FailoverAlarmConsumer(flakyPrimary, standby, failoverSettings);

    consumer.Subscribe(@"\\HOST\Galaxy!Area"); // threshold=1 → switch to Subtag
    Assert.Equal(AlarmProviderMode.Subtag, consumer.Mode);

    flakyPrimary.ThrowOnPoll = false; // primary is healthy, so a probe would poll cleanly

    consumer.ProbeOnce(); // first probe is allowed through and polls the primary
    int pollsSeenAfterFirstProbe = flakyPrimary.Polls;
    Assert.Equal(1, pollsSeenAfterFirstProbe);

    consumer.ProbeOnce(); // still inside the 3600s interval → throttled, no new poll
    Assert.Equal(pollsSeenAfterFirstProbe, flakyPrimary.Polls);
    Assert.Equal(AlarmProviderMode.Subtag, consumer.Mode);
}
/// <summary>
/// Worker.Tests-032 regression. <c>RunPrimary</c> filters its catch with
/// <c>when (ex is not OutOfMemoryException)</c>, so an
/// <see cref="OutOfMemoryException"/> from the primary must propagate instead
/// of being swallowed and counted toward the failover threshold. No mode
/// change may be raised: a fatal allocation failure is not a clean degraded
/// handoff.
/// </summary>
[Fact]
public void RunPrimary_WhenPrimaryThrowsOutOfMemory_PropagatesAndDoesNotFailOver()
{
    var oomPrimary = new FlakyPrimary();
    oomPrimary.ThrowOnPoll = false;
    oomPrimary.ThrowOutOfMemoryOnPoll = true;
    var standby = new StubStandby();
    var failoverSettings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
    using var consumer = new FailoverAlarmConsumer(oomPrimary, standby, failoverSettings);
    bool sawModeChange = false;
    consumer.ProviderModeChanged += (_, _) => sawModeChange = true;

    consumer.Subscribe(@"\\HOST\Galaxy!Area"); // Subscribe path does not poll; no throw here

    Assert.Throws<OutOfMemoryException>(() => consumer.PollOnce());
    Assert.False(sawModeChange);
    Assert.Equal(AlarmProviderMode.Alarmmgr, consumer.Mode);
}
/// <summary>
/// Worker.Tests-032 regression. <see cref="FailoverSettings"/> must clamp a
/// <c>threshold</c> or <c>stableProbes</c> below one, and a
/// <c>probeIntervalSeconds</c> below zero, up to its safe minimum so that a
/// misconfigured bind cannot alter failover semantics. In-range values pass
/// through unchanged.
/// </summary>
[Theory]
[InlineData(0, 0, 0, 1, 0, 1)]
[InlineData(-5, -5, -5, 1, 0, 1)]
[InlineData(3, 7, 2, 3, 7, 2)]
public void FailoverSettings_ClampsSubMinimumValues(
    int threshold,
    int probeInterval,
    int stableProbes,
    int expectedThreshold,
    int expectedProbeInterval,
    int expectedStableProbes)
{
    var clamped = new FailoverSettings(
        threshold: threshold,
        probeIntervalSeconds: probeInterval,
        stableProbes: stableProbes);

    Assert.Equal(expectedThreshold, clamped.Threshold);
    Assert.Equal(expectedProbeInterval, clamped.ProbeIntervalSeconds);
    Assert.Equal(expectedStableProbes, clamped.StableProbes);
}
}
@@ -109,6 +109,31 @@ public sealed class SubtagAlarmStateMachineTests
Assert.Throws<ArgumentException>(() => new SubtagAlarmStateMachine(new[] { first, second }));
}
/// <summary>
/// Worker-028 regression. Two watch-list entries that carry the same
/// <see cref="AlarmSubtagTarget.AlarmFullReference"/> but different subtag
/// addresses must be rejected at construction — mirroring the
/// duplicate-address guard — instead of the later entry silently replacing
/// the earlier reference's state and orphaning its bound addresses.
/// </summary>
[Fact]
public void DuplicateAlarmFullReference_Throws()
{
    const string SharedReference = "Galaxy!Area.Tank01.Level.HiHi";
    AlarmSubtagTarget[] clashingTargets =
    {
        new AlarmSubtagTarget
        {
            AlarmFullReference = SharedReference,
            SourceObjectReference = "Tank01",
            ActiveSubtag = "Tank01.Level.HiHi.active",
        },
        new AlarmSubtagTarget
        {
            AlarmFullReference = SharedReference,
            SourceObjectReference = "Tank01",
            ActiveSubtag = "Other.active",
        },
    };

    Assert.Throws<ArgumentException>(() => new SubtagAlarmStateMachine(clashingTargets));
}
[Fact]
public void AckedTrueWhileActive_EmitsAck()
{
@@ -162,4 +187,64 @@ public sealed class SubtagAlarmStateMachineTests
var events = sm.Apply("Some.Other.Tag.active", true, DateTime.UtcNow);
Assert.Empty(events);
}
/// <summary>
/// Worker.Tests-033 regression. An ack that arrives while the alarm is NOT
/// active must produce no events and must NOT latch
/// <c>AckedDuringEpisode</c>; a latched stale ack would turn the next
/// episode's clear into a spurious ACK_RTN. The raise/clear that follows must
/// therefore still report UNACK_RTN.
/// </summary>
[Fact]
public void AckedTrueWhileInactive_EmitsNothingAndDoesNotLatch()
{
    var machine = new SubtagAlarmStateMachine(new[] { Target() });
    var start = new DateTime(2026, 6, 13, 9, 0, 0, DateTimeKind.Utc);
    DateTime raisedAt = start.AddSeconds(5);
    DateTime clearedAt = start.AddSeconds(10);

    // No raise has happened yet, so this ack must be a no-op.
    var eventsFromStaleAck = machine.Apply("Tank01.Level.HiHi.acked", true, start);
    Assert.Empty(eventsFromStaleAck);

    // Fresh episode: raise, then clear. The stale ack must not have latched,
    // so the clear is reported as UNACK_RTN.
    machine.Apply("Tank01.Level.HiHi.active", true, raisedAt);
    var eventsFromClear = machine.Apply("Tank01.Level.HiHi.active", false, clearedAt);

    var clearEvent = Assert.Single(eventsFromClear);
    Assert.Equal(MxAlarmStateKind.UnackRtn, clearEvent.Record.State);
}
/// <summary>
/// Worker.Tests-033 regression. A value written to the priority subtag must
/// pass through <c>CoerceInt</c> and show up as
/// <see cref="MxAlarmSnapshotRecord.Priority"/> on the next emitted record,
/// while a non-numeric value must keep the previously recorded priority
/// (the CoerceInt fallback path).
/// </summary>
[Fact]
public void PriorityChange_FlowsIntoEmittedRecord()
{
    const string ActiveTag = "Tank01.Level.HiHi.active";
    const string PriorityTag = "Tank01.Level.HiHi.priority";
    var watched = new AlarmSubtagTarget
    {
        AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi",
        SourceObjectReference = "Tank01",
        ActiveSubtag = ActiveTag,
        AckedSubtag = "Tank01.Level.HiHi.acked",
        PrioritySubtag = PriorityTag,
    };
    var machine = new SubtagAlarmStateMachine(new[] { watched });
    var start = new DateTime(2026, 6, 13, 9, 0, 0, DateTimeKind.Utc);

    // Changing only the priority emits nothing, but the value is remembered.
    var eventsFromPriority = machine.Apply(PriorityTag, 750, start);
    Assert.Empty(eventsFromPriority);

    // The raise that follows carries the remembered priority.
    var eventsFromRaise = machine.Apply(ActiveTag, true, start.AddSeconds(1));
    var raiseEvent = Assert.Single(eventsFromRaise);
    Assert.Equal(750, raiseEvent.Record.Priority);

    // A non-numeric priority keeps the existing value rather than resetting to zero.
    machine.Apply(PriorityTag, "not-a-number", start.AddSeconds(2));
    var activeRecord = Assert.Single(machine.SnapshotActive());
    Assert.Equal(750, activeRecord.Priority);
}
}
@@ -24,4 +24,43 @@ public sealed class SyntheticAlarmGuidTests
/// <summary>
/// A non-empty reference must never derive <see cref="Guid.Empty"/>.
/// </summary>
[Fact]
public void Reference_ProducesNonEmptyGuid()
{
    Guid derived = SyntheticAlarmGuid.ForReference("A.B.C");

    Assert.NotEqual(Guid.Empty, derived);
}
/// <summary>
/// The empty string must also derive a GUID other than
/// <see cref="Guid.Empty"/>. An all-zero result would collide with the
/// unset-record default downstream; the derivation's length fold is what
/// keeps this input from being degenerate.
/// </summary>
[Fact]
public void EmptyReference_ProducesNonEmptyGuid()
{
    Guid derived = SyntheticAlarmGuid.ForReference(string.Empty);

    Assert.NotEqual(Guid.Empty, derived);
}
/// <summary>
/// Worker-027 regression. <see cref="SyntheticAlarmGuid.ForReference"/> must
/// derive its GUID without going through
/// <see cref="System.Security.Cryptography"/>, because on net48
/// <c>MD5.Create()</c> throws under the Windows FIPS-compliance policy. The
/// test turns on the <c>UseLegacyFipsThrow</c> AppContext switch for the
/// duration of the call, checks that the derivation still succeeds and is
/// repeatable, and then puts the switch back to its previous value.
/// </summary>
/// <remarks>
/// NOTE(review): this assumes the switch alone makes the crypto factories
/// throw. Presumably it only restores the legacy throw when the OS FIPS
/// policy is itself enabled — confirm that this test really fails on a
/// non-FIPS build agent if a FIPS-gated provider is reintroduced.
/// </remarks>
[Fact]
public void ForReference_UnderFipsEnforcement_DoesNotThrowAndStaysDeterministic()
{
    const string FipsThrowSwitch = "Switch.System.Security.Cryptography.UseLegacyFipsThrow";
    const string Reference = "Galaxy!Area.Tank01.Level.HiHi";

    // An unset switch is treated as "off" when it is restored afterwards.
    bool wasEnabled = AppContext.TryGetSwitch(FipsThrowSwitch, out bool enabled) && enabled;
    AppContext.SetSwitch(FipsThrowSwitch, true);
    try
    {
        Guid initial = SyntheticAlarmGuid.ForReference(Reference);
        Guid repeated = SyntheticAlarmGuid.ForReference(Reference);

        Assert.NotEqual(Guid.Empty, initial);
        Assert.Equal(initial, repeated);
    }
    finally
    {
        AppContext.SetSwitch(FipsThrowSwitch, wasEnabled);
    }
}
}
@@ -313,12 +313,20 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
consecutiveFailures = 0;
cleanProbes = 0;
// Emit the mode-changed notification FIRST and in a guarded block, so
// the gateway always learns the feed went degraded even if the priming
// snapshot below throws. A handler exception here must never escape the
// switch — escaping would (a) leave `active` flipped with no
// notification and (b) unwind into RunAlarmPollLoopAsync's trailing
// catch, which permanently stops alarm polling (Worker-026).
RaiseModeChanged(AlarmProviderMode.Subtag, reason, hresult);
// Warm the standby snapshot for the gateway hand-off. The gateway
// reconciles state from this snapshot, so the return value is not
// consumed here — the call exists for its priming side effect.
_ = standby.SnapshotActiveAlarms();
RaiseModeChanged(AlarmProviderMode.Subtag, reason, hresult);
// consumed here — the call exists for its priming side effect. A
// failure to prime is non-fatal: the switch has already completed and
// been announced, and the standby's live transitions will still flow.
TryPrimeStandbySnapshot();
}
private void SwitchToPrimary(string reason, int hresult)
@@ -327,14 +335,49 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
mode = AlarmProviderMode.Alarmmgr;
consecutiveFailures = 0;
cleanProbes = 0;
// Guarded so a ProviderModeChanged handler exception cannot escape into
// the STA poll loop and kill alarm delivery (Worker-026).
RaiseModeChanged(AlarmProviderMode.Alarmmgr, reason, hresult);
}
/// <summary>
/// Best-effort warm-up of the standby snapshot for the gateway hand-off.
/// By the time this runs the switch is complete and the mode change has
/// been announced, so a failure here is swallowed rather than allowed to
/// abort the switch or unwind into the poll loop. Only
/// <see cref="OutOfMemoryException"/> is left to propagate.
/// </summary>
private void TryPrimeStandbySnapshot()
{
    try
    {
        // Called for its priming side effect only; the returned list is dropped.
        standby.SnapshotActiveAlarms();
    }
    catch (Exception failure) when (!(failure is OutOfMemoryException))
    {
        // Deliberately ignored: the standby is already active and its live
        // transitions keep flowing, so the gateway can reconcile from later
        // records. A transient snapshot failure must not stop the poll loop.
    }
}
/// <summary>
/// Notifies <see cref="ProviderModeChanged"/> subscribers, if any, of a
/// provider-mode switch. Each notification carries a new
/// <see cref="AlarmProviderModeChange"/> built from the arguments and
/// stamped with <see cref="DateTime.UtcNow"/>.
/// </summary>
/// <param name="newMode">The mode being announced.</param>
/// <param name="reason">Reason text passed through to subscribers.</param>
/// <param name="hresult">HRESULT passed through to subscribers.</param>
private void RaiseModeChanged(AlarmProviderMode newMode, string reason, int hresult)
{
ProviderModeChanged?.Invoke(
this,
new AlarmProviderModeChange(newMode, reason, hresult, DateTime.UtcNow));
try
{
ProviderModeChanged?.Invoke(
this,
new AlarmProviderModeChange(newMode, reason, hresult, DateTime.UtcNow));
}
catch (Exception ex) when (ex is not OutOfMemoryException)
{
// A subscriber's OnProviderModeChanged handler threw (e.g. the
// AlarmCommandHandler's eventQueue.Enqueue hitting capacity). The
// switch itself has already taken effect; swallow so the failure
// cannot unwind into RunAlarmPollLoopAsync and permanently stop
// alarm polling (Worker-026). The event-queue overflow it most
// likely signals is already surfaced as a fault on the IPC path.
}
}
private void OnChildTransition(object? sender, MxAlarmTransitionEvent e)
@@ -42,8 +42,22 @@ public sealed class SubtagAlarmStateMachine
foreach (AlarmSubtagTarget target in targets)
{
// Guard duplicate references symmetrically with the dup-address guard
// in Bind: two watch-list entries that share an AlarmFullReference but
// differ in subtag addresses would otherwise silently overwrite the
// earlier _statesByReference entry while its addresses stay bound to an
// orphaned (and therefore invisible) AlarmState, producing silently
// inconsistent synthesized state. Fail fast at subscribe time instead.
string reference = target.AlarmFullReference ?? string.Empty;
if (_statesByReference.ContainsKey(reference))
{
throw new ArgumentException(
$"Duplicate alarm full reference '{reference}' is bound to more than one alarm target.",
nameof(targets));
}
var state = new AlarmState(target);
_statesByReference[target.AlarmFullReference] = state;
_statesByReference[reference] = state;
Bind(target.ActiveSubtag, state, SubtagRole.Active);
Bind(target.AckedSubtag, state, SubtagRole.Acked);
@@ -1,5 +1,4 @@
using System;
using System.Security.Cryptography;
using System.Text;
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
@@ -11,8 +10,29 @@ namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
/// repeated transitions for the same alarm reference correlate downstream
/// (acknowledge, snapshot, OPC UA mapping) without an alarmmgr-supplied GUID.
/// </summary>
/// <remarks>
/// The 128-bit value is computed with a fixed FNV-1a hash over the UTF-8
/// bytes of the reference, deliberately <strong>not</strong> via
/// <c>System.Security.Cryptography</c>. On .NET Framework 4.8
/// <c>MD5.Create()</c> returns the non-validated
/// <c>MD5CryptoServiceProvider</c>, whose constructor throws under the
/// Windows FIPS-compliance policy ("not part of the Windows Platform FIPS
/// validated cryptographic algorithms"). Because this derivation needs only
/// determinism and distinctness — never cryptographic strength — a plain
/// non-crypto hash avoids the FIPS gate entirely, so the subtag fallback
/// keeps working on regulated (FIPS-enabled) hosts exactly when it is needed.
/// </remarks>
internal static class SyntheticAlarmGuid
{
// 64-bit FNV-1a constants (RFC-style; widely used reference values).
private const ulong FnvOffsetBasis = 14695981039346656037UL;
private const ulong FnvPrime = 1099511628211UL;
// A second independent seed for the high 8 bytes so the full 128-bit value
// is well-distributed across distinct references rather than two correlated
// halves of the same single-pass hash.
private const ulong FnvSecondSeed = 1469598103934665603UL;
/// <summary>
/// Produces a stable <see cref="Guid"/> for the given alarm reference.
/// The same reference always maps to the same GUID; distinct references
@@ -32,11 +52,39 @@ internal static class SyntheticAlarmGuid
byte[] bytes = Encoding.UTF8.GetBytes(reference);
// MD5 is used purely for a stable, non-cryptographic identity mapping
// (reference -> 16-byte GUID), never for security. Its 128-bit output
// fits a GUID exactly, which is why it is preferred here.
using MD5 md5 = MD5.Create();
byte[] hash = md5.ComputeHash(bytes);
return new Guid(hash);
// A single pass over the bytes drives two 64-bit FNV-1a-style accumulators
// that fill the low and high halves of the 128-bit value. The high
// accumulator starts from its own seed and XORs in each byte plus that
// byte's index, so the two halves are not simply copies of one another.
ulong low = FnvOffsetBasis;
ulong high = FnvSecondSeed;
for (int i = 0; i < bytes.Length; i++)
{
byte b = bytes[i];
low ^= b;
low *= FnvPrime;
high ^= unchecked(b + (ulong)i);
high *= FnvPrime;
}
// Fold the length in so the empty string and other short inputs are not
// degenerate (an all-zero / Guid.Empty result is undesirable downstream).
low ^= (ulong)bytes.Length;
low *= FnvPrime;
byte[] guidBytes = new byte[16];
WriteUInt64(guidBytes, 0, low);
WriteUInt64(guidBytes, 8, high);
return new Guid(guidBytes);
}
/// <summary>
/// Stores <paramref name="value"/> into eight consecutive bytes of
/// <paramref name="buffer"/> starting at <paramref name="offset"/>, least
/// significant byte first.
/// </summary>
/// <param name="buffer">Destination array; it must have at least eight bytes available from <paramref name="offset"/>.</param>
/// <param name="offset">Index of the first byte to write.</param>
/// <param name="value">The 64-bit value to serialise.</param>
private static void WriteUInt64(byte[] buffer, int offset, ulong value)
{
    ulong remaining = value;
    int position = offset;
    for (int written = 0; written < sizeof(ulong); written++)
    {
        // Emit the current low byte, then shift the next one into place.
        buffer[position++] = (byte)remaining;
        remaining >>= 8;
    }
}
}