fix(worker): resilient failover switch; FIPS-safe synthetic GUID; dup-reference guard + tests (Worker-026..028, Worker.Tests-031..033)

This commit is contained in:
Joseph Doherty
2026-06-15 02:56:15 -04:00
parent ddf2d84fbc
commit cebe67e9bd
8 changed files with 584 additions and 20 deletions
@@ -313,12 +313,20 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
consecutiveFailures = 0;
cleanProbes = 0;
// Emit the mode-changed notification FIRST and in a guarded block, so
// the gateway always learns the feed went degraded even if the priming
// snapshot below throws. A handler exception here must never escape the
// switch — escaping would (a) leave `active` flipped with no
// notification and (b) unwind into RunAlarmPollLoopAsync's trailing
// catch, which permanently stops alarm polling (Worker-026).
RaiseModeChanged(AlarmProviderMode.Subtag, reason, hresult);
// Warm the standby snapshot for the gateway hand-off. The gateway
// reconciles state from this snapshot, so the return value is not
// consumed here — the call exists for its priming side effect.
_ = standby.SnapshotActiveAlarms();
RaiseModeChanged(AlarmProviderMode.Subtag, reason, hresult);
// consumed here — the call exists for its priming side effect. A
// failure to prime is non-fatal: the switch has already completed and
// been announced, and the standby's live transitions will still flow.
TryPrimeStandbySnapshot();
}
private void SwitchToPrimary(string reason, int hresult)
@@ -327,14 +335,49 @@ public sealed class FailoverAlarmConsumer : IMxAccessAlarmConsumer
mode = AlarmProviderMode.Alarmmgr;
consecutiveFailures = 0;
cleanProbes = 0;
// Guarded so a ProviderModeChanged handler exception cannot escape into
// the STA poll loop and kill alarm delivery (Worker-026).
RaiseModeChanged(AlarmProviderMode.Alarmmgr, reason, hresult);
}
/// <summary>
/// Best-effort warm-up of the standby snapshot: calls
/// <c>standby.SnapshotActiveAlarms()</c>, discards the result, and swallows
/// every exception except <see cref="OutOfMemoryException"/>.
/// </summary>
/// <remarks>
/// Intended to run after the switch has completed and the mode change has
/// been announced, so a priming failure must neither abort the switch nor
/// unwind into the poll loop.
/// </remarks>
private void TryPrimeStandbySnapshot()
{
    try
    {
        // Only the priming side effect matters; the snapshot itself is unused.
        _ = standby.SnapshotActiveAlarms();
    }
    catch (Exception primingFailure) when (primingFailure is not OutOfMemoryException)
    {
        // Deliberately ignored. The standby is already the active provider
        // and its live transitions keep flowing, so the gateway reconciles
        // from later records. OutOfMemoryException is excluded by the filter
        // and still propagates.
    }
}
/// <summary>
/// Raises <c>ProviderModeChanged</c> once with a freshly built
/// <c>AlarmProviderModeChange</c>, without letting a subscriber failure
/// propagate to the caller.
/// </summary>
/// <param name="newMode">Provider mode reported in the notification.</param>
/// <param name="reason">Reason text forwarded in the notification.</param>
/// <param name="hresult">HRESULT value forwarded in the notification.</param>
private void RaiseModeChanged(AlarmProviderMode newMode, string reason, int hresult)
{
    // The event is raised exactly once, and only inside the guard below. An
    // additional unguarded invocation ahead of the try block would notify
    // subscribers twice and let the first handler exception escape, which is
    // precisely what the guard exists to prevent.
    try
    {
        ProviderModeChanged?.Invoke(
            this,
            new AlarmProviderModeChange(newMode, reason, hresult, DateTime.UtcNow));
    }
    catch (Exception ex) when (ex is not OutOfMemoryException)
    {
        // A subscriber's OnProviderModeChanged handler threw (e.g. the
        // AlarmCommandHandler's eventQueue.Enqueue hitting capacity). The
        // switch itself has already taken effect; swallow so the failure
        // cannot unwind into RunAlarmPollLoopAsync and permanently stop
        // alarm polling (Worker-026). The event-queue overflow it most
        // likely signals is already surfaced as a fault on the IPC path.
        // OutOfMemoryException is excluded by the filter and still propagates.
    }
}
private void OnChildTransition(object? sender, MxAlarmTransitionEvent e)
@@ -42,8 +42,22 @@ public sealed class SubtagAlarmStateMachine
foreach (AlarmSubtagTarget target in targets)
{
// Guard duplicate references symmetrically with the dup-address guard
// in Bind: two watch-list entries that share an AlarmFullReference but
// differ in subtag addresses would otherwise silently overwrite the
// earlier _statesByReference entry while its addresses stay bound to an
// orphaned (and therefore invisible) AlarmState, producing silently
// inconsistent synthesized state. Fail fast at subscribe time instead.
string reference = target.AlarmFullReference ?? string.Empty;
if (_statesByReference.ContainsKey(reference))
{
throw new ArgumentException(
$"Duplicate alarm full reference '{reference}' is bound to more than one alarm target.",
nameof(targets));
}
var state = new AlarmState(target);
_statesByReference[target.AlarmFullReference] = state;
_statesByReference[reference] = state;
Bind(target.ActiveSubtag, state, SubtagRole.Active);
Bind(target.AckedSubtag, state, SubtagRole.Acked);
@@ -1,5 +1,4 @@
using System;
using System.Security.Cryptography;
using System.Text;
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
@@ -11,8 +10,29 @@ namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
/// repeated transitions for the same alarm reference correlate downstream
/// (acknowledge, snapshot, OPC UA mapping) without an alarmmgr-supplied GUID.
/// </summary>
/// <remarks>
/// The 128-bit value is computed with a fixed FNV-1a hash over the UTF-8
/// bytes of the reference, deliberately <strong>not</strong> via
/// <c>System.Security.Cryptography</c>. On .NET Framework 4.8
/// <c>MD5.Create()</c> returns the non-validated
/// <c>MD5CryptoServiceProvider</c>, whose constructor throws under the
/// Windows FIPS-compliance policy ("not part of the Windows Platform FIPS
/// validated cryptographic algorithms"). Because this derivation needs only
/// determinism and distinctness — never cryptographic strength — a plain
/// non-crypto hash avoids the FIPS gate entirely, so the subtag fallback
/// keeps working on regulated (FIPS-enabled) hosts exactly when it is needed.
/// </remarks>
internal static class SyntheticAlarmGuid
{
// Standard 64-bit FNV-1a parameters: the offset basis (0xCBF29CE484222325)
// and the FNV prime (0x100000001B3).
private const ulong FnvOffsetBasis = 14695981039346656037UL;
private const ulong FnvPrime = 1099511628211UL;
// Separate starting value for the accumulator that fills the high 8 bytes,
// so the two halves of the 128-bit value do not begin from the same state
// and are not simply two copies of one 64-bit hash.
private const ulong FnvSecondSeed = 1469598103934665603UL;
/// <summary>
/// Produces a stable <see cref="Guid"/> for the given alarm reference.
/// The same reference always maps to the same GUID; distinct references
@@ -32,11 +52,39 @@ internal static class SyntheticAlarmGuid
byte[] bytes = Encoding.UTF8.GetBytes(reference);
// MD5 is used purely for a stable, non-cryptographic identity mapping
// (reference -> 16-byte GUID), never for security. Its 128-bit output
// fits a GUID exactly, which is why it is preferred here.
using MD5 md5 = MD5.Create();
byte[] hash = md5.ComputeHash(bytes);
return new Guid(hash);
// A single pass over the bytes drives two FNV-1a-style accumulators that
// fill the low and high 64 bits of the 128-bit value. The high accumulator
// starts from its own seed and XORs in each byte plus its index, so its
// per-step input differs from the low accumulator's and the two halves are
// not correlated copies of the same hash.
ulong low = FnvOffsetBasis;
ulong high = FnvSecondSeed;
for (int i = 0; i < bytes.Length; i++)
{
byte b = bytes[i];
low ^= b;
low *= FnvPrime;
high ^= unchecked(b + (ulong)i);
high *= FnvPrime;
}
// Fold the length in so the empty string and other short inputs are not
// degenerate (an all-zero / Guid.Empty result is undesirable downstream).
low ^= (ulong)bytes.Length;
low *= FnvPrime;
byte[] guidBytes = new byte[16];
WriteUInt64(guidBytes, 0, low);
WriteUInt64(guidBytes, 8, high);
return new Guid(guidBytes);
}
/// <summary>
/// Writes <paramref name="value"/> into <paramref name="buffer"/> as eight
/// bytes in little-endian order (least significant byte first), starting at
/// <paramref name="offset"/>.
/// </summary>
/// <param name="buffer">Destination array; must have room for eight bytes from <paramref name="offset"/>.</param>
/// <param name="offset">Index of the first byte to write.</param>
/// <param name="value">The 64-bit value to serialize.</param>
private static void WriteUInt64(byte[] buffer, int offset, ulong value)
{
    ulong remaining = value;
    int position = offset;

    // Emit the current low byte, then shift the next one into place.
    for (int written = 0; written < 8; written++)
    {
        buffer[position] = (byte)remaining;
        remaining >>= 8;
        position++;
    }
}
}