feat(driver-galaxy): consume the gateway's session-less alarm model

The mxaccessgw gateway moved alarms to a session-less central monitor:
AcknowledgeAlarm no longer takes a SessionId, and alarm transitions now
arrive on the session-less StreamAlarms feed instead of the per-session
worker StreamEvents stream. As a result, the GalaxyDriver no longer
compiled against the updated client.

- GatewayGalaxyAlarmAcknowledger: session-less rewrite — no GalaxyMxSession;
  outcome read from ProtocolStatus (throw) and Hresult (warn).
- New IGalaxyAlarmFeed seam + GatewayGalaxyAlarmFeed: background consumer
  of StreamAlarms that decodes the active-alarm snapshot plus live
  transitions into GalaxyAlarmTransition and reopens the stream on
  transport faults.
- EventPump: drop the dead per-session OnAlarmTransition path; the
  per-session stream no longer carries alarms.
- GalaxyDriver: bridge the feed onto IAlarmSource.OnAlarmEvent; the feed
  starts on SubscribeAlarmsAsync, independent of data subscriptions.
- Tests: replace EventPumpAlarmTests with GatewayGalaxyAlarmFeedTests;
  move the driver alarm-source tests onto the IGalaxyAlarmFeed seam.

Browse needed no change — GatewayGalaxyHierarchySource consumes the
unchanged DiscoverHierarchy contract.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-22 03:59:36 -04:00
parent cd2306db66
commit 27a8d05b7c
9 changed files with 713 additions and 557 deletions
@@ -45,12 +45,6 @@ internal sealed class EventPump : IAsyncDisposable
private static readonly Counter<long> EventsDropped =
Meter.CreateCounter<long>("galaxy.events.dropped", unit: "{event}",
description: "MxEvents dropped because the bounded channel was full (newest-dropped).");
private static readonly Counter<long> AlarmTransitionsReceived =
Meter.CreateCounter<long>("galaxy.alarm_transitions.received", unit: "{event}",
description: "OnAlarmTransition events decoded and forwarded to driver-level handlers.");
private static readonly Counter<long> AlarmTransitionsDecodingFailures =
Meter.CreateCounter<long>("galaxy.alarm_transitions.decoding_failures", unit: "{event}",
description: "OnAlarmTransition events that arrived without a populated body or with an unspecified transition kind.");
private readonly IGalaxySubscriber _subscriber;
private readonly SubscriptionRegistry _registry;
@@ -66,15 +60,6 @@ internal sealed class EventPump : IAsyncDisposable
public event EventHandler<DataChangeEventArgs>? OnDataChange;
/// <summary>
/// Fires for every <see cref="MxEventFamily.OnAlarmTransition"/> event the
/// gateway forwards. Decoded into a <see cref="GalaxyAlarmTransition"/> with
/// the OPC UA severity bucket already mapped via
/// <see cref="MxAccessSeverityMapper"/>. The driver wraps this onto
/// <c>IAlarmSource.OnAlarmEvent</c> in PR B.2.
/// </summary>
internal event EventHandler<GalaxyAlarmTransition>? OnAlarmTransition;
public EventPump(
IGalaxySubscriber subscriber,
SubscriptionRegistry registry,
@@ -179,13 +164,12 @@ internal sealed class EventPump : IAsyncDisposable
case MxEventFamily.OnDataChange:
DispatchDataChange(ev);
break;
case MxEventFamily.OnAlarmTransition:
DispatchAlarmTransition(ev);
break;
default:
// OnWriteComplete / OperationComplete / OnBufferedDataChange are filtered
// out — write callers get their reply via the InvokeAsync round-trip, not
// via the event stream.
// OnAlarmTransition is no longer carried on the per-session event stream
// — alarms come from the gateway's session-less StreamAlarms feed
// (GatewayGalaxyAlarmFeed). OnWriteComplete / OperationComplete /
// OnBufferedDataChange are filtered out: write callers get their reply
// via the InvokeAsync round-trip, not via the event stream.
return;
}
}
@@ -212,73 +196,6 @@ internal sealed class EventPump : IAsyncDisposable
}
}
/// <summary>
/// Decodes one <c>OnAlarmTransition</c> event into a <see cref="GalaxyAlarmTransition"/>
/// and forwards it to the <c>OnAlarmTransition</c> subscribers. Events without a body,
/// or with an unspecified transition kind, are counted as decoding failures and dropped.
/// </summary>
/// <param name="ev">The gateway event to decode.</param>
private void DispatchAlarmTransition(MxEvent ev)
{
    var payload = ev.OnAlarmTransition;

    // No body (e.g. malformed gateway event or worker version skew): count it and
    // drop it rather than forwarding a half-populated transition.
    if (payload is null)
    {
        AlarmTransitionsDecodingFailures.Add(1, _clientTag);
        _logger.LogDebug(
            "Galaxy OnAlarmTransition event arrived without a populated body (sequence={Sequence}); ignoring.",
            ev.WorkerSequence);
        return;
    }

    // An unspecified kind cannot be mapped to a driver-level transition.
    if (payload.TransitionKind is AlarmTransitionKind.Unspecified)
    {
        AlarmTransitionsDecodingFailures.Add(1, _clientTag);
        _logger.LogDebug(
            "Galaxy OnAlarmTransition for {AlarmRef} has unspecified transition kind; ignoring.",
            payload.AlarmFullReference);
        return;
    }

    var severity = MxAccessSeverityMapper.Map(payload.Severity);

    // A missing transition timestamp falls back to "now"; a missing original-raise
    // timestamp stays null.
    DateTime occurredAtUtc = payload.TransitionTimestamp?.ToDateTime() ?? DateTime.UtcNow;
    DateTime? firstRaisedAtUtc = payload.OriginalRaiseTimestamp?.ToDateTime();

    var decoded = new GalaxyAlarmTransition(
        AlarmFullReference: payload.AlarmFullReference,
        SourceObjectReference: payload.SourceObjectReference,
        AlarmTypeName: payload.AlarmTypeName,
        TransitionKind: MapTransitionKind(payload.TransitionKind),
        SeverityBucket: severity.Item1,
        OpcUaSeverity: severity.Item2,
        RawMxAccessSeverity: payload.Severity,
        OriginalRaiseTimestampUtc: firstRaisedAtUtc,
        TransitionTimestampUtc: occurredAtUtc,
        OperatorUser: payload.OperatorUser,
        OperatorComment: payload.OperatorComment,
        Category: payload.Category,
        Description: payload.Description);

    AlarmTransitionsReceived.Add(1, _clientTag);

    // A throwing subscriber must not take the pump down: log and carry on.
    var subscribers = OnAlarmTransition;
    if (subscribers is null)
    {
        return;
    }

    try
    {
        subscribers(this, decoded);
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex,
            "Galaxy OnAlarmTransition handler threw for {AlarmRef} — continuing.",
            decoded.AlarmFullReference);
    }
}
/// <summary>
/// Translates the gateway's wire-level transition kind into the driver-level
/// <see cref="GalaxyAlarmTransitionKind"/>. Any value without a direct counterpart
/// maps to <see cref="GalaxyAlarmTransitionKind.Unspecified"/>.
/// </summary>
private static GalaxyAlarmTransitionKind MapTransitionKind(AlarmTransitionKind kind)
{
    switch (kind)
    {
        case AlarmTransitionKind.Raise:
            return GalaxyAlarmTransitionKind.Raise;
        case AlarmTransitionKind.Acknowledge:
            return GalaxyAlarmTransitionKind.Acknowledge;
        case AlarmTransitionKind.Clear:
            return GalaxyAlarmTransitionKind.Clear;
        case AlarmTransitionKind.Retrigger:
            return GalaxyAlarmTransitionKind.Retrigger;
        default:
            return GalaxyAlarmTransitionKind.Unspecified;
    }
}
private DataValueSnapshot ToSnapshot(MxEvent ev)
{
var value = MxValueDecoder.Decode(ev.Value);
@@ -5,26 +5,27 @@ using MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Production <see cref="IGalaxyAlarmAcknowledger"/> backed by the
/// <c>MxGatewayClient.AcknowledgeAlarmAsync</c> RPC (PR E.2). Maps the
/// reply's protocol status into a thrown exception when the gateway
/// reports a non-OK condition; native MxStatus failures inside the reply
/// surface as a logged warning so operator workflows aren't blocked by a
/// transient MxAccess hiccup.
/// Production <see cref="IGalaxyAlarmAcknowledger"/> backed by the session-less
/// <c>MxGatewayClient.AcknowledgeAlarmAsync</c> RPC. The updated gateway routes
/// acknowledgement through its always-on central alarm monitor, so no worker
/// session is involved — the driver supplies only the alarm reference, comment,
/// and operator principal.
/// </summary>
/// <remarks>
/// A non-OK <see cref="ProtocolStatus"/> means the gateway never reached MXAccess
/// (transport / dispatch failure) and is surfaced as a thrown exception. A non-zero
/// native ack return code (<c>hresult</c>) means MXAccess itself rejected the ack;
/// that is logged as a warning rather than thrown so a transient MXAccess hiccup
/// doesn't block the operator workflow — the operator can retry.
/// </remarks>
internal sealed class GatewayGalaxyAlarmAcknowledger : IGalaxyAlarmAcknowledger
{
private readonly MxGatewayClient _client;
private readonly GalaxyMxSession _session;
private readonly ILogger _logger;
public GatewayGalaxyAlarmAcknowledger(
MxGatewayClient client,
GalaxyMxSession session,
ILogger logger)
public GatewayGalaxyAlarmAcknowledger(MxGatewayClient client, ILogger logger)
{
_client = client ?? throw new ArgumentNullException(nameof(client));
_session = session ?? throw new ArgumentNullException(nameof(session));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
@@ -36,15 +37,9 @@ internal sealed class GatewayGalaxyAlarmAcknowledger : IGalaxyAlarmAcknowledger
{
ArgumentException.ThrowIfNullOrEmpty(alarmFullReference);
var session = _session.Session
?? throw new InvalidOperationException(
"GatewayGalaxyAlarmAcknowledger requires a connected GalaxyMxSession; underlying gateway session is null.");
var sessionId = session.SessionId;
var reply = await _client.AcknowledgeAlarmAsync(
new AcknowledgeAlarmRequest
{
SessionId = sessionId,
ClientCorrelationId = Guid.NewGuid().ToString("N"),
AlarmFullReference = alarmFullReference,
Comment = comment ?? string.Empty,
@@ -52,14 +47,23 @@ internal sealed class GatewayGalaxyAlarmAcknowledger : IGalaxyAlarmAcknowledger
},
cancellationToken).ConfigureAwait(false);
if (reply.Status is { Success: 0 } status)
// Protocol status — the gateway failed before MXAccess saw the ack. This is a
// hard failure: the operator's request was not delivered at all.
if (reply.ProtocolStatus is { } proto && proto.Code != ProtocolStatusCode.Ok)
{
throw new InvalidOperationException(
$"Galaxy AcknowledgeAlarm for '{alarmFullReference}' failed at the gateway: "
+ $"{proto.Code} {proto.Message}");
}
// hresult is the authoritative native ack return code (0 = success). It is
// absent only on a worker protocol violation; with an OK protocol status a
// missing value is treated as success.
if (reply.HasHresult && reply.Hresult != 0)
{
// Native MxAccess rejected the ack — log but don't throw. Treat as a
// best-effort operator workflow; the operator can retry via the OPC UA
// session if necessary.
_logger.LogWarning(
"Galaxy AcknowledgeAlarm for {AlarmRef} returned MxStatus failure: category={Category} detail={Detail} text={Text}",
alarmFullReference, status.Category, status.Detail, status.DiagnosticText);
"Galaxy AcknowledgeAlarm for {AlarmRef} returned native ack failure code {Hresult}.",
alarmFullReference, reply.Hresult);
}
}
}
@@ -0,0 +1,264 @@
using System.Diagnostics.Metrics;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Production <see cref="IGalaxyAlarmFeed"/> over the gateway's session-less
/// <c>StreamAlarms</c> RPC. The stream opens with one <see cref="ActiveAlarmSnapshot"/>
/// per currently-active alarm (the ConditionRefresh snapshot), then a
/// <c>snapshot_complete</c> sentinel, then a live <see cref="OnAlarmTransitionEvent"/>
/// for every subsequent raise / acknowledge / clear. Each message is decoded into a
/// <see cref="GalaxyAlarmTransition"/> (severity already bucketed via
/// <see cref="MxAccessSeverityMapper"/>) and surfaced on <see cref="OnAlarmTransition"/>.
/// </summary>
/// <remarks>
/// <para>
/// The feed is independent of any worker session — the gateway's always-on central
/// alarm monitor owns the AVEVA subscription. The driver previously decoded alarm
/// transitions off the per-session <c>StreamEvents</c> stream (<see cref="EventPump"/>);
/// that path was retired when the gateway moved to the session-less alarm model.
/// </para>
/// <para>
/// The stream is supplied as a factory delegate (production passes
/// <c>MxGatewayClient.StreamAlarmsAsync</c>) so tests can drive synthetic feeds.
/// Streaming RPCs are not covered by the client's unary retry pipeline, so the feed
/// owns its reconnect: whenever the stream faults or ends while the feed is still
/// running it logs, waits <c>reconnectDelay</c>, and re-opens. The gateway re-sends
/// the active-alarm snapshot on every re-open, so the OPC UA condition layer sees
/// current state after a reconnect.
/// </para>
/// </remarks>
internal sealed class GatewayGalaxyAlarmFeed : IGalaxyAlarmFeed
{
    /// <summary>
    /// Opens a <c>StreamAlarms</c> feed. Matches the method group
    /// <c>MxGatewayClient.StreamAlarmsAsync</c>.
    /// </summary>
    internal delegate IAsyncEnumerable<AlarmFeedMessage> AlarmStreamFactory(
        StreamAlarmsRequest request, CancellationToken cancellationToken);

    private static readonly TimeSpan DefaultReconnectDelay = TimeSpan.FromSeconds(5);

    // Shares the driver meter name so a host-level MeterListener catches feed counters
    // alongside the EventPump's. Distinct Meter instance — same name is intentional.
    private static readonly Meter Meter = new(EventPump.MeterName);

    private static readonly Counter<long> AlarmTransitionsReceived =
        Meter.CreateCounter<long>("galaxy.alarm_feed.transitions.received", unit: "{event}",
            description: "Alarm feed messages decoded and forwarded to driver-level handlers.");

    private static readonly Counter<long> AlarmTransitionsDecodingFailures =
        Meter.CreateCounter<long>("galaxy.alarm_feed.transitions.decoding_failures", unit: "{event}",
            description: "Alarm feed messages dropped for a missing body or unspecified transition kind.");

    private static readonly Counter<long> AlarmFeedReconnects =
        Meter.CreateCounter<long>("galaxy.alarm_feed.reconnects", unit: "{reconnect}",
            description: "Times the alarm feed re-opened its StreamAlarms stream after a transport fault.");

    private readonly AlarmStreamFactory _streamFactory;
    private readonly ILogger _logger;
    private readonly string _alarmFilterPrefix;
    private readonly TimeSpan _reconnectDelay;
    private readonly KeyValuePair<string, object?> _clientTag;
    private readonly CancellationTokenSource _cts = new();

    private Task? _loop;
    private bool _disposed;

    /// <summary>
    /// Raised for every decoded snapshot entry and live transition. Handlers are
    /// invoked on the feed's background task; an exception thrown by a handler is
    /// logged and does not stop the feed.
    /// </summary>
    public event EventHandler<GalaxyAlarmTransition>? OnAlarmTransition;

    /// <summary>Creates a feed over the supplied stream factory.</summary>
    /// <param name="streamFactory">Opens one <c>StreamAlarms</c> stream per (re)connect attempt.</param>
    /// <param name="logger">Destination for diagnostics; defaults to a no-op logger.</param>
    /// <param name="clientName">Value of the <c>galaxy.client</c> metric tag; defaults to <c>&lt;unknown&gt;</c>.</param>
    /// <param name="alarmFilterPrefix">Sent as the request's alarm filter prefix; defaults to empty.</param>
    /// <param name="reconnectDelay">Pause between a stream ending and the next re-open; defaults to 5 seconds.</param>
    /// <exception cref="ArgumentNullException"><paramref name="streamFactory"/> is null.</exception>
    public GatewayGalaxyAlarmFeed(
        AlarmStreamFactory streamFactory,
        ILogger? logger = null,
        string? clientName = null,
        string? alarmFilterPrefix = null,
        TimeSpan? reconnectDelay = null)
    {
        _streamFactory = streamFactory ?? throw new ArgumentNullException(nameof(streamFactory));
        _logger = logger ?? NullLogger.Instance;
        _alarmFilterPrefix = alarmFilterPrefix ?? string.Empty;
        _reconnectDelay = reconnectDelay ?? DefaultReconnectDelay;
        _clientTag = new KeyValuePair<string, object?>("galaxy.client", clientName ?? "<unknown>");
    }

    /// <summary>
    /// Starts the background consume loop. Calls after the first are no-ops.
    /// </summary>
    /// <exception cref="ObjectDisposedException">The feed has already been disposed.</exception>
    public void Start()
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        // NOTE(review): the check-then-assign below is not synchronized; this assumes
        // callers do not invoke Start concurrently — confirm against the driver.
        if (_loop is not null)
        {
            return;
        }

        _loop = Task.Run(() => RunAsync(_cts.Token));
    }

    /// <summary>
    /// Consume loop: opens the stream, dispatches every message, and re-opens after
    /// <c>_reconnectDelay</c> whenever the stream ends or faults, until
    /// <paramref name="ct"/> is cancelled.
    /// </summary>
    private async Task RunAsync(CancellationToken ct)
    {
        var firstAttempt = true;
        while (!ct.IsCancellationRequested)
        {
            if (!firstAttempt)
            {
                AlarmFeedReconnects.Add(1, _clientTag);
            }

            firstAttempt = false;

            try
            {
                var request = new StreamAlarmsRequest
                {
                    ClientCorrelationId = Guid.NewGuid().ToString("N"),
                    AlarmFilterPrefix = _alarmFilterPrefix,
                };

                await foreach (var message in _streamFactory(request, ct)
                    .WithCancellation(ct).ConfigureAwait(false))
                {
                    if (ct.IsCancellationRequested)
                    {
                        break;
                    }

                    Dispatch(message);
                }

                // The stream completed without throwing. The feed is meant to be
                // long-lived, so make the silent re-open visible to operators.
                if (!ct.IsCancellationRequested)
                {
                    _logger.LogInformation(
                        "Galaxy alarm feed stream ended without a fault — reopening in {DelaySeconds}s.",
                        _reconnectDelay.TotalSeconds);
                }
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested)
            {
                return; // clean shutdown
            }
            catch (Exception ex) when (ct.IsCancellationRequested)
            {
                // Some transports report cancellation as their own exception type
                // rather than OperationCanceledException. During shutdown that is not
                // a fault, so do not emit the "reopening" warning.
                _logger.LogDebug(ex, "Galaxy alarm feed stream stopped during shutdown.");
                return;
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex,
                    "Galaxy alarm feed stream faulted — reopening in {DelaySeconds}s.",
                    _reconnectDelay.TotalSeconds);
            }

            try
            {
                await Task.Delay(_reconnectDelay, ct).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested)
            {
                return;
            }
        }
    }

    /// <summary>Routes one feed message to the decoder for its payload case.</summary>
    private void Dispatch(AlarmFeedMessage message)
    {
        switch (message.PayloadCase)
        {
            case AlarmFeedMessage.PayloadOneofCase.ActiveAlarm:
                DispatchSnapshotEntry(message.ActiveAlarm);
                break;
            case AlarmFeedMessage.PayloadOneofCase.Transition:
                DispatchTransition(message.Transition);
                break;
            case AlarmFeedMessage.PayloadOneofCase.SnapshotComplete:
                _logger.LogDebug("Galaxy alarm feed active-alarm snapshot complete.");
                break;
            default:
                // Empty oneof — worker / gateway version skew. Count and drop.
                AlarmTransitionsDecodingFailures.Add(1, _clientTag);
                _logger.LogDebug(
                    "Galaxy alarm feed message has no recognised payload ({PayloadCase}); ignoring.",
                    message.PayloadCase);
                break;
        }
    }

    /// <summary>
    /// Decode one entry of the initial active-alarm snapshot. Each entry is surfaced
    /// as a transition so the OPC UA Part 9 condition layer sees the alarm's present
    /// state on (re)connect: <see cref="AlarmConditionState.Active"/> as a
    /// <see cref="GalaxyAlarmTransitionKind.Raise"/>,
    /// <see cref="AlarmConditionState.ActiveAcked"/> as a
    /// <see cref="GalaxyAlarmTransitionKind.Acknowledge"/>, and
    /// <see cref="AlarmConditionState.Inactive"/> as a
    /// <see cref="GalaxyAlarmTransitionKind.Clear"/>. Any other state is counted as a
    /// decoding failure and dropped.
    /// </summary>
    private void DispatchSnapshotEntry(ActiveAlarmSnapshot snapshot)
    {
        var kind = snapshot.CurrentState switch
        {
            AlarmConditionState.Active => GalaxyAlarmTransitionKind.Raise,
            AlarmConditionState.ActiveAcked => GalaxyAlarmTransitionKind.Acknowledge,
            AlarmConditionState.Inactive => GalaxyAlarmTransitionKind.Clear,
            _ => GalaxyAlarmTransitionKind.Unspecified,
        };

        if (kind == GalaxyAlarmTransitionKind.Unspecified)
        {
            AlarmTransitionsDecodingFailures.Add(1, _clientTag);
            _logger.LogDebug(
                "Galaxy alarm feed snapshot entry for {AlarmRef} has unspecified condition state; ignoring.",
                snapshot.AlarmFullReference);
            return;
        }

        var (bucket, opcUaSeverity) = MxAccessSeverityMapper.Map(snapshot.Severity);
        Raise(new GalaxyAlarmTransition(
            AlarmFullReference: snapshot.AlarmFullReference,
            SourceObjectReference: snapshot.SourceObjectReference,
            AlarmTypeName: snapshot.AlarmTypeName,
            TransitionKind: kind,
            SeverityBucket: bucket,
            OpcUaSeverity: opcUaSeverity,
            RawMxAccessSeverity: snapshot.Severity,
            OriginalRaiseTimestampUtc: snapshot.OriginalRaiseTimestamp?.ToDateTime(),
            // A missing last-transition timestamp falls back to the decode time.
            TransitionTimestampUtc: snapshot.LastTransitionTimestamp?.ToDateTime() ?? DateTime.UtcNow,
            OperatorUser: snapshot.OperatorUser,
            OperatorComment: snapshot.OperatorComment,
            Category: snapshot.Category,
            Description: snapshot.Description));
    }

    /// <summary>
    /// Decode one live transition. A transition with an unspecified kind is counted
    /// as a decoding failure and dropped.
    /// </summary>
    private void DispatchTransition(OnAlarmTransitionEvent body)
    {
        if (body.TransitionKind == AlarmTransitionKind.Unspecified)
        {
            AlarmTransitionsDecodingFailures.Add(1, _clientTag);
            _logger.LogDebug(
                "Galaxy alarm feed transition for {AlarmRef} has unspecified transition kind; ignoring.",
                body.AlarmFullReference);
            return;
        }

        var (bucket, opcUaSeverity) = MxAccessSeverityMapper.Map(body.Severity);
        Raise(new GalaxyAlarmTransition(
            AlarmFullReference: body.AlarmFullReference,
            SourceObjectReference: body.SourceObjectReference,
            AlarmTypeName: body.AlarmTypeName,
            TransitionKind: MapTransitionKind(body.TransitionKind),
            SeverityBucket: bucket,
            OpcUaSeverity: opcUaSeverity,
            RawMxAccessSeverity: body.Severity,
            OriginalRaiseTimestampUtc: body.OriginalRaiseTimestamp?.ToDateTime(),
            // A missing transition timestamp falls back to the decode time.
            TransitionTimestampUtc: body.TransitionTimestamp?.ToDateTime() ?? DateTime.UtcNow,
            OperatorUser: body.OperatorUser,
            OperatorComment: body.OperatorComment,
            Category: body.Category,
            Description: body.Description));
    }

    /// <summary>
    /// Counts the transition and invokes <see cref="OnAlarmTransition"/>. Handler
    /// exceptions are logged and swallowed so one bad subscriber cannot tear down
    /// the stream.
    /// </summary>
    private void Raise(GalaxyAlarmTransition transition)
    {
        AlarmTransitionsReceived.Add(1, _clientTag);
        try
        {
            OnAlarmTransition?.Invoke(this, transition);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex,
                "Galaxy alarm feed OnAlarmTransition handler threw for {AlarmRef} — continuing.",
                transition.AlarmFullReference);
        }
    }

    /// <summary>
    /// Maps the wire-level transition kind onto the driver-level enum; anything
    /// without a counterpart becomes <see cref="GalaxyAlarmTransitionKind.Unspecified"/>.
    /// </summary>
    private static GalaxyAlarmTransitionKind MapTransitionKind(AlarmTransitionKind kind) => kind switch
    {
        AlarmTransitionKind.Raise => GalaxyAlarmTransitionKind.Raise,
        AlarmTransitionKind.Acknowledge => GalaxyAlarmTransitionKind.Acknowledge,
        AlarmTransitionKind.Clear => GalaxyAlarmTransitionKind.Clear,
        AlarmTransitionKind.Retrigger => GalaxyAlarmTransitionKind.Retrigger,
        _ => GalaxyAlarmTransitionKind.Unspecified,
    };

    /// <summary>
    /// Cancels the consume loop, waits for it to finish, and releases the
    /// cancellation source. Calls after the first are no-ops.
    /// </summary>
    public async ValueTask DisposeAsync()
    {
        if (_disposed)
        {
            return;
        }

        _disposed = true;
        _cts.Cancel();

        if (_loop is not null)
        {
            try
            {
                await _loop.ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                // Disposal stays best-effort and must not throw, but a loop that
                // ended with a fault is still worth a trace.
                _logger.LogDebug(ex, "Galaxy alarm feed loop ended with a fault during shutdown.");
            }
        }

        _cts.Dispose();
    }
}
@@ -0,0 +1,29 @@
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Driver-side seam for the gateway's session-less alarm feed. Production wraps
/// <c>MxGatewayClient.StreamAlarmsAsync</c> (<see cref="GatewayGalaxyAlarmFeed"/>);
/// tests substitute a fake to drive synthetic <see cref="GalaxyAlarmTransition"/>
/// events through <see cref="GalaxyDriver"/>'s <c>IAlarmSource</c> bridge without a
/// running gateway.
/// </summary>
/// <remarks>
/// The feed is independent of any worker session — the updated gateway serves
/// alarms from an always-on central monitor, so the feed survives subscription
/// churn and reconnects its own stream on transient transport failures.
/// Consumption is stopped by disposing the feed.
/// </remarks>
internal interface IGalaxyAlarmFeed : IAsyncDisposable
{
/// <summary>
/// Fires for every alarm transition the gateway feed delivers — both the
/// entries of the initial active-alarm snapshot and every subsequent live
/// raise / acknowledge / clear. The OPC UA severity bucket is already mapped.
/// </summary>
/// <remarks>
/// In the production <see cref="GatewayGalaxyAlarmFeed"/>, handlers run on the
/// feed's background task, and an exception thrown by a handler is logged and
/// swallowed so the feed keeps running. Subscribe before calling
/// <see cref="Start"/> to avoid missing the initial snapshot.
/// </remarks>
event EventHandler<GalaxyAlarmTransition>? OnAlarmTransition;
/// <summary>
/// Start consuming the alarm feed on a background task. Idempotent — second
/// calls are no-ops while the loop is running.
/// </summary>
/// <remarks>
/// The production <see cref="GatewayGalaxyAlarmFeed"/> throws
/// <see cref="ObjectDisposedException"/> when called after disposal.
/// </remarks>
void Start();
}