fix(driver-galaxy): wire event-stream faults to the reconnect supervisor (Driver.Galaxy-001)

The ReconnectSupervisor was constructed, but its trigger,
ReportTransportFailure, was never called. When the gateway StreamEvents
stream faulted, EventPump just logged and exited — the supervisor was
never notified, so a transient gateway drop permanently stopped
data-change notifications while GetHealth() still reported Healthy.

EventPump gains an optional onStreamFault callback invoked from its
stream-fault catch block (not on clean shutdown). GalaxyDriver wires it
to ReconnectSupervisor.ReportTransportFailure so a transport drop drives
reopen → replay.

This is the minimal fix for Driver.Galaxy-001; the pump-restart-on-reopen gap remains
tracked as Driver.Galaxy-008. Regression tests cover the callback being
invoked on fault, the end-to-end supervisor reopen/replay, and that a
clean shutdown does not fire it. Driver.Galaxy suite: 206/206 pass.

Resolves code-review finding Driver.Galaxy-001 (Critical).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-22 05:54:33 -04:00
parent 796871c210
commit 4df8737c86
4 changed files with 175 additions and 9 deletions

View File

@@ -16,9 +16,11 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// </summary>
/// <remarks>
/// <para>
/// One pump per connected <see cref="GalaxyMxSession"/>. Reconnect lives in PR 4.5's
/// supervisor; on transport failure here we log + propagate so the supervisor can
/// decide whether to restart.
/// One pump per connected <see cref="GalaxyMxSession"/>. Reconnect lives in the
/// <see cref="ReconnectSupervisor"/>; on transport failure here we log and invoke
/// the optional <c>onStreamFault</c> callback so the owner (GalaxyDriver) can
/// forward the fault to <see cref="ReconnectSupervisor.ReportTransportFailure"/>
/// and the supervisor can drive reopen → replay.
/// </para>
/// <para>
/// PR 6.2 — the network-read loop and the listener-fanout loop are decoupled by a
@@ -50,6 +52,7 @@ internal sealed class EventPump : IAsyncDisposable
private readonly SubscriptionRegistry _registry;
private readonly ILogger _logger;
private readonly Func<long, ISubscriptionHandle> _handleFactory;
private readonly Action<Exception>? _onStreamFault;
private readonly Channel<MxEvent> _channel;
private readonly KeyValuePair<string, object?> _clientTag;
private readonly CancellationTokenSource _cts = new();
@@ -66,12 +69,14 @@ internal sealed class EventPump : IAsyncDisposable
ILogger? logger = null,
Func<long, ISubscriptionHandle>? handleFactory = null,
int channelCapacity = DefaultChannelCapacity,
string? clientName = null)
string? clientName = null,
Action<Exception>? onStreamFault = null)
{
_subscriber = subscriber ?? throw new ArgumentNullException(nameof(subscriber));
_registry = registry ?? throw new ArgumentNullException(nameof(registry));
_logger = logger ?? NullLogger.Instance;
_handleFactory = handleFactory ?? (id => new GalaxySubscriptionHandle(id));
_onStreamFault = onStreamFault;
if (channelCapacity < 1)
{
@@ -127,7 +132,20 @@ internal sealed class EventPump : IAsyncDisposable
catch (Exception ex)
{
_logger.LogWarning(ex,
"Galaxy EventPump loop ended with an exception — reconnect supervisor (PR 4.5) handles restart.");
"Galaxy EventPump loop ended with an exception — notifying reconnect supervisor.");
// The gateway StreamEvents stream faulted. Signal the reconnect supervisor so it
// drives reopen → replay. Without this the stream silently dies and a
// transient gateway drop permanently stops data-change notifications.
if (_onStreamFault is not null)
{
try { _onStreamFault(ex); }
catch (Exception cbEx)
{
_logger.LogWarning(cbEx,
"Galaxy EventPump stream-fault callback threw — supervisor may not have been notified.");
}
}
}
finally
{