fix(driver-galaxy): wire event-stream faults to the reconnect supervisor (Driver.Galaxy-001)
The ReconnectSupervisor was constructed but its trigger ReportTransportFailure was never called. When the gateway StreamEvents stream faulted, EventPump just logged and exited — the supervisor was never notified, so a transient gateway drop permanently stopped data-change notifications while GetHealth() still reported Healthy. EventPump gains an optional onStreamFault callback invoked from its stream-fault catch block (not on clean shutdown). GalaxyDriver wires it to ReconnectSupervisor.ReportTransportFailure so a transport drop drives reopen → replay. This is the minimal fix for -001; the pump-restart-on-reopen gap remains tracked as Driver.Galaxy-008. Regression tests cover the callback being invoked on fault, the end-to-end supervisor reopen/replay, and that a clean shutdown does not fire it. Driver.Galaxy suite: 206/206 pass. Resolves code-review finding Driver.Galaxy-001 (Critical). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,118 @@
|
||||
using System.Threading.Channels;
|
||||
using MxGateway.Contracts.Proto;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests.Runtime;
|
||||
|
||||
/// <summary>
/// Regression coverage for Driver.Galaxy-001 (Critical): when the gateway StreamEvents
/// stream faults, the <see cref="EventPump"/> must notify the reconnect supervisor
/// rather than silently logging and exiting. Without the <c>onStreamFault</c>
/// hand-off a transient gateway transport drop permanently kills the event stream.
/// </summary>
public sealed class EventPumpStreamFaultTests
{
    /// <summary>Upper bound, in milliseconds, that a test waits for an asynchronous signal.</summary>
    private const int WaitMs = 2_000;

    /// <summary>
    /// A faulted stream must invoke the <c>onStreamFault</c> callback and hand it the
    /// exception that completed the stream.
    /// </summary>
    [Fact]
    public async Task StreamFault_InvokesOnStreamFaultCallback_WithTheCause()
    {
        var subscriber = new FaultingSubscriber();
        var registry = new SubscriptionRegistry();
        var faultObserved = new TaskCompletionSource<Exception>(
            TaskCreationOptions.RunContinuationsAsynchronously);

        await using var pump = new EventPump(
            subscriber, registry, channelCapacity: 4, clientName: "FaultTest",
            onStreamFault: ex => faultObserved.TrySetResult(ex));
        pump.Start();

        // Drop the gateway stream — RunAsync's await foreach throws.
        subscriber.FaultStream(new IOException("simulated gateway transport drop"));

        await ShouldCompleteWithinWaitAsync(
            faultObserved.Task,
            "EventPump must invoke onStreamFault when the gw StreamEvents stream faults");
        (await faultObserved.Task).ShouldBeOfType<IOException>();
    }

    /// <summary>
    /// End-to-end: a faulting <see cref="EventPump"/> wired to a real
    /// <see cref="ReconnectSupervisor"/> must drive the supervisor through its
    /// reopen → replay recovery loop and back to a non-degraded state.
    /// </summary>
    [Fact]
    public async Task StreamFault_DrivesReconnectSupervisorReopenReplay()
    {
        var subscriber = new FaultingSubscriber();
        var registry = new SubscriptionRegistry();

        var reopenRan = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
        var replayRan = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);

        // Short backoff overrides keep the recovery loop well inside the WaitMs budget.
        using var supervisor = new ReconnectSupervisor(
            reopen: _ => { reopenRan.TrySetResult(); return Task.CompletedTask; },
            replay: _ => { replayRan.TrySetResult(); return Task.CompletedTask; },
            options: new ReconnectOptions(
                InitialBackoffOverride: TimeSpan.FromMilliseconds(5),
                MaxBackoffOverride: TimeSpan.FromMilliseconds(20)));

        await using var pump = new EventPump(
            subscriber, registry, channelCapacity: 4, clientName: "FaultTest",
            onStreamFault: supervisor.ReportTransportFailure);
        pump.Start();

        supervisor.CurrentState.ShouldBe(ReconnectSupervisor.State.Healthy);

        subscriber.FaultStream(new IOException("simulated gateway transport drop"));

        await ShouldCompleteWithinWaitAsync(
            reopenRan.Task,
            "stream fault must trigger the supervisor's reopen path");
        await ShouldCompleteWithinWaitAsync(
            replayRan.Task,
            "stream fault must trigger the supervisor's replay path");

        // Own the timeout source so its timer is released when the test ends
        // instead of lingering until the deadline elapses.
        using var healthyTimeout = new CancellationTokenSource(WaitMs);
        await supervisor.WaitForHealthyAsync(healthyTimeout.Token);
        supervisor.IsDegraded.ShouldBeFalse();
    }

    /// <summary>
    /// Disposing the pump without any stream fault must not invoke <c>onStreamFault</c>.
    /// </summary>
    [Fact]
    public async Task CleanShutdown_DoesNotInvokeOnStreamFault()
    {
        var subscriber = new FaultingSubscriber();
        var registry = new SubscriptionRegistry();
        var faulted = false;

        var pump = new EventPump(
            subscriber, registry, channelCapacity: 4, clientName: "FaultTest",
            onStreamFault: _ => faulted = true);
        pump.Start();

        // Graceful disposal cancels the loop — that is OperationCanceledException,
        // not a transport fault, and must NOT trip the supervisor.
        await pump.DisposeAsync();

        faulted.ShouldBeFalse("clean shutdown must not be reported as a transport fault");
    }

    /// <summary>
    /// Asserts that <paramref name="signal"/> completes before a <see cref="WaitMs"/>
    /// millisecond timeout elapses.
    /// </summary>
    /// <param name="signal">The task expected to complete.</param>
    /// <param name="because">Failure message reported when the timeout wins the race.</param>
    private static async Task ShouldCompleteWithinWaitAsync(Task signal, string because)
    {
        using var timeoutCts = new CancellationTokenSource();
        var winner = await Task.WhenAny(signal, Task.Delay(WaitMs, timeoutCts.Token));

        // Cancel the delay so its timer does not outlive the assertion.
        timeoutCts.Cancel();

        winner.ShouldBe(signal, because);
    }

    /// <summary>
    /// <see cref="IGalaxySubscriber"/> fake whose StreamEvents stream can be faulted
    /// on demand so the EventPump's RunAsync catch path is exercised.
    /// </summary>
    private sealed class FaultingSubscriber : IGalaxySubscriber
    {
        // Backing channel for the fake event stream; nothing is ever written to it,
        // it is only completed with an error by FaultStream.
        private readonly Channel<MxEvent> _stream =
            Channel.CreateUnbounded<MxEvent>(new UnboundedChannelOptions { SingleReader = true });

        /// <summary>No-op subscribe: always returns an empty result list.</summary>
        public Task<IReadOnlyList<SubscribeResult>> SubscribeBulkAsync(
            IReadOnlyList<string> fullReferences, int bufferedUpdateIntervalMs, CancellationToken cancellationToken)
            => Task.FromResult<IReadOnlyList<SubscribeResult>>([]);

        /// <summary>No-op unsubscribe: completes immediately.</summary>
        public Task UnsubscribeBulkAsync(IReadOnlyList<int> itemHandles, CancellationToken cancellationToken)
            => Task.CompletedTask;

        /// <summary>Exposes the backing channel as the event stream.</summary>
        public IAsyncEnumerable<MxEvent> StreamEventsAsync(CancellationToken cancellationToken)
            => _stream.Reader.ReadAllAsync(cancellationToken);

        /// <summary>Fault the stream so the pump's <c>await foreach</c> throws.</summary>
        public void FaultStream(Exception cause) => _stream.Writer.TryComplete(cause);
    }
}
|
||||
Reference in New Issue
Block a user