fix(communication): resolve Communication-004..008 — Resume supervision, gRPC option wiring, address-load logging, sync dispose, flap detection
This commit is contained in:
@@ -22,6 +22,9 @@ public class DebugStreamBridgeActorTests : TestKit
|
||||
{
|
||||
// Use a very short reconnect delay for testing
|
||||
DebugStreamBridgeActor.ReconnectDelay = TimeSpan.FromMilliseconds(100);
|
||||
// Long stability window so streams are never considered "stable" mid-test
|
||||
// unless a test deliberately waits it out.
|
||||
DebugStreamBridgeActor.StabilityWindow = TimeSpan.FromSeconds(30);
|
||||
}
|
||||
|
||||
private record TestContext(
|
||||
@@ -264,8 +267,13 @@ public class DebugStreamBridgeActorTests : TestKit
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Grpc_Error_Resets_RetryCount_On_Successful_Event()
|
||||
public void FlappingStream_DeliveringEventsBetweenFailures_StillTerminatesAfterMaxRetries()
|
||||
{
|
||||
// Communication-008 regression: a stream that connects, delivers an event,
|
||||
// then fails — repeatedly — must still trip MaxRetries. The retry count is
|
||||
// NO LONGER reset by a received event (only by the stability window). The
|
||||
// previous behaviour reset _retryCount on every event, so a flapping site
|
||||
// reconnected forever and the debug session lived on indefinitely.
|
||||
var ctx = CreateBridgeActor();
|
||||
ctx.CommProbe.ExpectMsg<SiteEnvelope>();
|
||||
|
||||
@@ -275,30 +283,72 @@ public class DebugStreamBridgeActorTests : TestKit
|
||||
new List<AlarmStateChanged>(),
|
||||
DateTimeOffset.UtcNow);
|
||||
|
||||
Watch(ctx.BridgeActor);
|
||||
ctx.BridgeActor.Tell(snapshot);
|
||||
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, TimeSpan.FromSeconds(3));
|
||||
|
||||
// First error → retry 1
|
||||
ctx.MockGrpcClient.SubscribeCalls[0].OnError(new Exception("Error 1"));
|
||||
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 2, TimeSpan.FromSeconds(5));
|
||||
|
||||
// Simulate successful event (resets retry count)
|
||||
var attrChange = new AttributeValueChanged(InstanceName, "IO", "Temp", 42.5, "Good", DateTimeOffset.UtcNow);
|
||||
ctx.MockGrpcClient.SubscribeCalls[1].OnEvent(attrChange);
|
||||
AwaitCondition(() => { lock (ctx.ReceivedEvents) { return ctx.ReceivedEvents.Count == 2; } },
|
||||
TimeSpan.FromSeconds(3));
|
||||
|
||||
// Now another 3 errors should be tolerated (retry count was reset)
|
||||
ctx.MockGrpcClient.SubscribeCalls[1].OnError(new Exception("Error 2"));
|
||||
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 3, TimeSpan.FromSeconds(5));
|
||||
// Flap: deliver one event then fail, three times. Each event would, under
|
||||
// the old buggy logic, reset the retry budget and prevent termination.
|
||||
for (var i = 0; i < 3; i++)
|
||||
{
|
||||
var call = ctx.MockGrpcClient.SubscribeCalls[i];
|
||||
call.OnEvent(attrChange);
|
||||
call.OnError(new Exception($"Flap {i + 1}"));
|
||||
var expected = i + 2;
|
||||
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == expected, TimeSpan.FromSeconds(5));
|
||||
}
|
||||
|
||||
ctx.MockGrpcClient.SubscribeCalls[2].OnError(new Exception("Error 3"));
|
||||
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 4, TimeSpan.FromSeconds(5));
|
||||
// Fourth error (after the 3 retries) must exceed MaxRetries and terminate.
|
||||
ctx.MockGrpcClient.SubscribeCalls[3].OnEvent(attrChange);
|
||||
ctx.MockGrpcClient.SubscribeCalls[3].OnError(new Exception("Flap 4"));
|
||||
|
||||
ctx.MockGrpcClient.SubscribeCalls[3].OnError(new Exception("Error 4"));
|
||||
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 5, TimeSpan.FromSeconds(5));
|
||||
ExpectTerminated(ctx.BridgeActor, TimeSpan.FromSeconds(5));
|
||||
Assert.True(ctx.TerminatedFlag[0]);
|
||||
}
|
||||
|
||||
// Still alive — 3 retries from the second failure point succeeded
|
||||
[Fact]
public void RetryCount_RecoveredOnlyAfterStreamStaysStableForStabilityWindow()
{
    // Communication-008: once a stream has stayed connected for the whole
    // stability window, its retry budget is restored. A later transient fault
    // therefore starts from a fresh set of retries instead of the spent ones.
    DebugStreamBridgeActor.StabilityWindow = TimeSpan.FromMilliseconds(300);
    try
    {
        var ctx = CreateBridgeActor();
        ctx.CommProbe.ExpectMsg<SiteEnvelope>();

        var initialSnapshot = new DebugViewSnapshot(
            InstanceName,
            new List<AttributeValueChanged>(),
            new List<AlarmStateChanged>(),
            DateTimeOffset.UtcNow);

        Watch(ctx.BridgeActor);
        ctx.BridgeActor.Tell(initialSnapshot);
        AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, TimeSpan.FromSeconds(3));

        // Five failures in a row. Before each one the current stream is left
        // alone for longer than the 300ms stability window, so the retry budget
        // has been recovered by the time the failure is injected.
        for (var failure = 1; failure <= 5; failure++)
        {
            Thread.Sleep(450); // exceed the 300ms stability window

            var currentStream = ctx.MockGrpcClient.SubscribeCalls[failure - 1];
            currentStream.OnError(new Exception($"Error {failure}"));

            // Every failure must be followed by exactly one new subscribe call.
            var subscribeCallsAfterReconnect = failure + 1;
            AwaitCondition(
                () => ctx.MockGrpcClient.SubscribeCalls.Count == subscribeCallsAfterReconnect,
                TimeSpan.FromSeconds(5));
        }

        // The actor survived all five well-spaced failures: each reconnect got
        // its retries back once the stability window had elapsed.
        Assert.False(ctx.TerminatedFlag[0]);
    }
    finally
    {
        // Put back the long window that the test class constructor configures.
        DebugStreamBridgeActor.StabilityWindow = TimeSpan.FromSeconds(30);
    }
}
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Options;
|
||||
using NSubstitute;
|
||||
using ScadaLink.Communication;
|
||||
using ScadaLink.Communication.Grpc;
|
||||
|
||||
namespace ScadaLink.Communication.Tests.Grpc;
|
||||
|
||||
/// <summary>
/// Regression tests for Communication-005 — the gRPC keepalive and
/// max-stream-lifetime / max-concurrent-stream options defined on
/// <see cref="CommunicationOptions"/> must actually be applied to the
/// gRPC client and server rather than hard-coded.
/// </summary>
public class GrpcOptionsWiringTests
{
    /// <summary>
    /// A client constructed directly from <see cref="CommunicationOptions"/>
    /// reports the configured keepalive ping delay and timeout.
    /// </summary>
    [Fact]
    public void SiteStreamGrpcClient_AppliesKeepAliveFromOptions()
    {
        var options = new CommunicationOptions
        {
            GrpcKeepAlivePingDelay = TimeSpan.FromSeconds(42),
            GrpcKeepAlivePingTimeout = TimeSpan.FromSeconds(7)
        };

        var client = new SiteStreamGrpcClient(
            "http://localhost:9999", NullLogger<SiteStreamGrpcClient>.Instance, options);
        try
        {
            Assert.Equal(TimeSpan.FromSeconds(42), client.KeepAlivePingDelay);
            Assert.Equal(TimeSpan.FromSeconds(7), client.KeepAlivePingTimeout);
        }
        finally
        {
            // No factory owns this client, so the test has to release it itself;
            // previously it was simply dropped without being disposed.
            client.Dispose();
        }
    }

    /// <summary>
    /// Options handed to the factory are passed on to every client it creates.
    /// </summary>
    [Fact]
    public void SiteStreamGrpcClientFactory_FlowsOptionsToCreatedClients()
    {
        var options = new CommunicationOptions
        {
            GrpcKeepAlivePingDelay = TimeSpan.FromSeconds(33),
            GrpcKeepAlivePingTimeout = TimeSpan.FromSeconds(11)
        };

        // The factory is disposed at the end of the test via the using declaration.
        using var factory = new SiteStreamGrpcClientFactory(
            NullLoggerFactory.Instance, Options.Create(options));

        var client = factory.GetOrCreate("site1", "http://localhost:9999");

        Assert.Equal(TimeSpan.FromSeconds(33), client.KeepAlivePingDelay);
        Assert.Equal(TimeSpan.FromSeconds(11), client.KeepAlivePingTimeout);
    }

    /// <summary>
    /// The server exposes the max-concurrent-streams and max-stream-lifetime
    /// values taken from the supplied options.
    /// </summary>
    [Fact]
    public void SiteStreamGrpcServer_BindsMaxConcurrentStreamsAndLifetimeFromOptions()
    {
        var options = new CommunicationOptions
        {
            GrpcMaxConcurrentStreams = 250,
            GrpcMaxStreamLifetime = TimeSpan.FromHours(2)
        };

        var subscriber = Substitute.For<ISiteStreamSubscriber>();
        var server = new SiteStreamGrpcServer(
            subscriber, NullLogger<SiteStreamGrpcServer>.Instance, Options.Create(options));

        Assert.Equal(250, server.MaxConcurrentStreams);
        Assert.Equal(TimeSpan.FromHours(2), server.MaxStreamLifetime);
    }
}
|
||||
@@ -0,0 +1,129 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using ScadaLink.Communication.Grpc;
|
||||
|
||||
namespace ScadaLink.Communication.Tests.Grpc;
|
||||
|
||||
/// <summary>
/// Regression tests for Communication-007 — the factory's synchronous
/// <see cref="SiteStreamGrpcClientFactory.Dispose"/> must not block on the
/// async disposal path (sync-over-async). It must dispose each client through
/// the client's synchronous <see cref="SiteStreamGrpcClient.Dispose"/>.
/// </summary>
public class SiteStreamGrpcClientFactoryDisposeTests
{
    /// <summary>
    /// Test client that records whether it was disposed via the sync or async path.
    /// </summary>
    private sealed class TrackingClient : SiteStreamGrpcClient
    {
        public bool SyncDisposeCalled { get; private set; }
        public bool AsyncDisposeCalled { get; private set; }

        public override void Dispose() => SyncDisposeCalled = true;

        public override ValueTask DisposeAsync()
        {
            AsyncDisposeCalled = true;
            return ValueTask.CompletedTask;
        }
    }

    /// <summary>
    /// Test factory that hands out <see cref="TrackingClient"/> instances while
    /// still exercising the base factory's real caching and disposal machinery.
    /// </summary>
    private sealed class TrackingFactory : SiteStreamGrpcClientFactory
    {
        private readonly ConcurrentBag<TrackingClient> _created = new();

        public TrackingFactory() : base(NullLoggerFactory.Instance) { }

        /// <summary>Snapshot of every client this factory has created so far.</summary>
        public IReadOnlyCollection<TrackingClient> Created => _created.ToList();

        protected override SiteStreamGrpcClient CreateClient(string grpcEndpoint)
        {
            var client = new TrackingClient();
            _created.Add(client);
            return client;
        }
    }

    /// <summary>
    /// Disposing the factory must call each client's synchronous
    /// <c>Dispose()</c> and must not touch <c>DisposeAsync()</c>.
    /// </summary>
    [Fact]
    public void Dispose_DisposesClientsSynchronously_NotViaAsyncPath()
    {
        var factory = new TrackingFactory();
        factory.GetOrCreate("site-a", "http://localhost:5100");
        factory.GetOrCreate("site-b", "http://localhost:5200");

        factory.Dispose();

        Assert.NotEmpty(factory.Created);
        Assert.All(factory.Created, c =>
        {
            Assert.True(c.SyncDisposeCalled, "client should be disposed via synchronous Dispose()");
            Assert.False(c.AsyncDisposeCalled, "synchronous Dispose() must not route through DisposeAsync()");
        });
    }

    /// <summary>
    /// Runs <c>factory.Dispose()</c> on the single worker thread of a
    /// <see cref="SingleThreadSyncContext"/> and fails if it does not finish
    /// within five seconds or throws.
    /// </summary>
    [Fact]
    public void Dispose_DoesNotDeadlock_UnderSingleThreadedSynchronizationContext()
    {
        // A strict single-threaded SynchronizationContext: continuations posted to
        // it are only pumped by the worker loop. Sync-over-async (blocking the only
        // thread on an async continuation that needs that same thread) deadlocks here.
        using var ctx = new SingleThreadSyncContext();
        Exception? captured = null;

        // Not wrapped in `using`: if the wait below times out, the posted callback
        // may still reach done.Set() later on the worker thread.
        var done = new ManualResetEventSlim();

        ctx.Post(_ =>
        {
            try
            {
                var factory = new SiteStreamGrpcClientFactory(NullLoggerFactory.Instance);
                factory.GetOrCreate("site-a", "http://localhost:5100");
                factory.Dispose();
            }
            catch (Exception ex)
            {
                // Hand the failure to the test thread so it is reported as an
                // assertion failure instead of escaping on the worker thread.
                captured = ex;
            }
            finally
            {
                done.Set();
            }
        }, null);

        Assert.True(done.Wait(TimeSpan.FromSeconds(5)),
            "factory.Dispose() did not complete — likely a sync-over-async deadlock");
        Assert.Null(captured);
    }

    /// <summary>Minimal single-threaded synchronization context for the deadlock test.</summary>
    private sealed class SingleThreadSyncContext : SynchronizationContext, IDisposable
    {
        private readonly BlockingCollection<(SendOrPostCallback cb, object? state)> _queue = new();
        private readonly Thread _thread;

        public SingleThreadSyncContext()
        {
            _thread = new Thread(Run) { IsBackground = true };
            _thread.Start();
        }

        /// <summary>
        /// Worker loop: installs this context on the worker thread and executes
        /// queued callbacks one at a time until adding is completed.
        /// </summary>
        private void Run()
        {
            SetSynchronizationContext(this);
            foreach (var (cb, state) in _queue.GetConsumingEnumerable())
            {
                cb(state);
            }
        }

        /// <summary>Queues the callback for execution on the worker thread.</summary>
        public override void Post(SendOrPostCallback d, object? state) => _queue.Add((d, state));

        /// <summary>
        /// Stops accepting work, waits up to two seconds for the worker to drain
        /// and exit, and then releases the queue.
        /// </summary>
        public void Dispose()
        {
            _queue.CompleteAdding();

            // Release the queue only once the worker has really exited. If the
            // join times out, the worker may still return to the consuming
            // enumerator later, and a disposed collection would then throw on
            // that thread.
            if (_thread.Join(TimeSpan.FromSeconds(2)))
            {
                _queue.Dispose();
            }
        }
    }
}
|
||||
Reference in New Issue
Block a user