fix(communication): resolve Communication-004..008 — Resume supervision, gRPC option wiring, address-load logging, sync dispose, flap detection

This commit is contained in:
Joseph Doherty
2026-05-16 20:58:03 -04:00
parent 3e7a3d7e31
commit 31a6995d24
12 changed files with 656 additions and 51 deletions

View File

@@ -22,6 +22,9 @@ public class DebugStreamBridgeActorTests : TestKit
{
// Use a very short reconnect delay for testing
DebugStreamBridgeActor.ReconnectDelay = TimeSpan.FromMilliseconds(100);
// Long stability window so streams are never considered "stable" mid-test
// unless a test deliberately waits it out.
DebugStreamBridgeActor.StabilityWindow = TimeSpan.FromSeconds(30);
}
private record TestContext(
@@ -264,8 +267,13 @@ public class DebugStreamBridgeActorTests : TestKit
}
[Fact]
public void Grpc_Error_Resets_RetryCount_On_Successful_Event()
public void FlappingStream_DeliveringEventsBetweenFailures_StillTerminatesAfterMaxRetries()
{
// Communication-008 regression: a stream that connects, delivers an event,
// then fails — repeatedly — must still trip MaxRetries. The retry count is
// NO LONGER reset by a received event (only by the stability window). The
// previous behaviour reset _retryCount on every event, so a flapping site
// reconnected forever and the debug session lived on indefinitely.
var ctx = CreateBridgeActor();
ctx.CommProbe.ExpectMsg<SiteEnvelope>();
@@ -275,30 +283,72 @@ public class DebugStreamBridgeActorTests : TestKit
new List<AlarmStateChanged>(),
DateTimeOffset.UtcNow);
Watch(ctx.BridgeActor);
ctx.BridgeActor.Tell(snapshot);
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, TimeSpan.FromSeconds(3));
// First error → retry 1
ctx.MockGrpcClient.SubscribeCalls[0].OnError(new Exception("Error 1"));
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 2, TimeSpan.FromSeconds(5));
// Simulate successful event (resets retry count)
var attrChange = new AttributeValueChanged(InstanceName, "IO", "Temp", 42.5, "Good", DateTimeOffset.UtcNow);
ctx.MockGrpcClient.SubscribeCalls[1].OnEvent(attrChange);
AwaitCondition(() => { lock (ctx.ReceivedEvents) { return ctx.ReceivedEvents.Count == 2; } },
TimeSpan.FromSeconds(3));
// Now another 3 errors should be tolerated (retry count was reset)
ctx.MockGrpcClient.SubscribeCalls[1].OnError(new Exception("Error 2"));
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 3, TimeSpan.FromSeconds(5));
// Flap: deliver one event then fail, three times. Each event would, under
// the old buggy logic, reset the retry budget and prevent termination.
for (var i = 0; i < 3; i++)
{
var call = ctx.MockGrpcClient.SubscribeCalls[i];
call.OnEvent(attrChange);
call.OnError(new Exception($"Flap {i + 1}"));
var expected = i + 2;
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == expected, TimeSpan.FromSeconds(5));
}
ctx.MockGrpcClient.SubscribeCalls[2].OnError(new Exception("Error 3"));
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 4, TimeSpan.FromSeconds(5));
// Fourth error (after the 3 retries) must exceed MaxRetries and terminate.
ctx.MockGrpcClient.SubscribeCalls[3].OnEvent(attrChange);
ctx.MockGrpcClient.SubscribeCalls[3].OnError(new Exception("Flap 4"));
ctx.MockGrpcClient.SubscribeCalls[3].OnError(new Exception("Error 4"));
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 5, TimeSpan.FromSeconds(5));
ExpectTerminated(ctx.BridgeActor, TimeSpan.FromSeconds(5));
Assert.True(ctx.TerminatedFlag[0]);
}
// Still alive — 3 retries from the second failure point succeeded
[Fact]
public void RetryCount_RecoveredOnlyAfterStreamStaysStableForStabilityWindow()
{
    // Communication-008: once a stream has stayed connected for the whole
    // stability window its retry budget is recovered, so a later transient
    // fault starts from a fresh budget instead of the exhausted one.
    const int spacedFailures = 5;
    var shortenedWindow = TimeSpan.FromMilliseconds(300);
    var quietPeriod = TimeSpan.FromMilliseconds(450); // deliberately longer than shortenedWindow

    DebugStreamBridgeActor.StabilityWindow = shortenedWindow;
    try
    {
        var ctx = CreateBridgeActor();
        ctx.CommProbe.ExpectMsg<SiteEnvelope>();

        var snapshot = new DebugViewSnapshot(
            InstanceName,
            new List<AttributeValueChanged>(),
            new List<AlarmStateChanged>(),
            DateTimeOffset.UtcNow);

        Watch(ctx.BridgeActor);
        ctx.BridgeActor.Tell(snapshot);
        AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, TimeSpan.FromSeconds(3));

        // Inject five failures, one per stream. Before each failure the current
        // stream is left alone for longer than the stability window, giving the
        // actor time to recover its retry budget before the next error lands.
        for (var failure = 1; failure <= spacedFailures; failure++)
        {
            Thread.Sleep(quietPeriod);

            var currentStream = ctx.MockGrpcClient.SubscribeCalls[failure - 1];
            currentStream.OnError(new Exception($"Error {failure}"));

            // Every failure must be answered by exactly one new subscription.
            var expectedSubscriptions = failure + 1;
            AwaitCondition(
                () => ctx.MockGrpcClient.SubscribeCalls.Count == expectedSubscriptions,
                TimeSpan.FromSeconds(5));
        }

        // Five well-spaced failures must not have terminated the actor: each
        // reconnect recovered its retry budget after the stability window.
        Assert.False(ctx.TerminatedFlag[0]);
    }
    finally
    {
        // StabilityWindow is static state shared by every test in the class;
        // put back the long window so other tests never see a "stable" stream.
        DebugStreamBridgeActor.StabilityWindow = TimeSpan.FromSeconds(30);
    }
}
}

View File

@@ -0,0 +1,67 @@
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using NSubstitute;
using ScadaLink.Communication;
using ScadaLink.Communication.Grpc;
namespace ScadaLink.Communication.Tests.Grpc;
/// <summary>
/// Regression tests for Communication-005 — the gRPC keepalive and
/// max-stream-lifetime / max-concurrent-stream options defined on
/// <see cref="CommunicationOptions"/> must actually be applied to the
/// gRPC client and server rather than hard-coded.
/// </summary>
/// <remarks>
/// Each test sets deliberately unusual option values, builds the component
/// under test, and reads the values back from the component's own properties.
/// </remarks>
public class GrpcOptionsWiringTests
{
    /// <summary>
    /// A client constructed directly from <see cref="CommunicationOptions"/>
    /// exposes the configured keepalive ping delay and timeout.
    /// </summary>
    [Fact]
    public void SiteStreamGrpcClient_AppliesKeepAliveFromOptions()
    {
        var options = new CommunicationOptions
        {
            GrpcKeepAlivePingDelay = TimeSpan.FromSeconds(42),
            GrpcKeepAlivePingTimeout = TimeSpan.FromSeconds(7)
        };

        var client = new SiteStreamGrpcClient(
            "http://localhost:9999", NullLogger<SiteStreamGrpcClient>.Instance, options);
        try
        {
            Assert.Equal(TimeSpan.FromSeconds(42), client.KeepAlivePingDelay);
            Assert.Equal(TimeSpan.FromSeconds(7), client.KeepAlivePingTimeout);
        }
        finally
        {
            // This client has no owning factory, so nothing else will dispose it.
            // Release it here, even when an assertion above fails.
            client.Dispose();
        }
    }

    /// <summary>
    /// Clients handed out by the factory carry the keepalive settings from the
    /// options the factory was constructed with.
    /// </summary>
    [Fact]
    public void SiteStreamGrpcClientFactory_FlowsOptionsToCreatedClients()
    {
        var options = new CommunicationOptions
        {
            GrpcKeepAlivePingDelay = TimeSpan.FromSeconds(33),
            GrpcKeepAlivePingTimeout = TimeSpan.FromSeconds(11)
        };

        // The factory is disposed at the end of the test via the using declaration.
        using var factory = new SiteStreamGrpcClientFactory(
            NullLoggerFactory.Instance, Options.Create(options));

        var client = factory.GetOrCreate("site1", "http://localhost:9999");

        Assert.Equal(TimeSpan.FromSeconds(33), client.KeepAlivePingDelay);
        Assert.Equal(TimeSpan.FromSeconds(11), client.KeepAlivePingTimeout);
    }

    /// <summary>
    /// The server exposes the max-concurrent-streams and max-stream-lifetime
    /// values from the options it was constructed with.
    /// </summary>
    [Fact]
    public void SiteStreamGrpcServer_BindsMaxConcurrentStreamsAndLifetimeFromOptions()
    {
        var options = new CommunicationOptions
        {
            GrpcMaxConcurrentStreams = 250,
            GrpcMaxStreamLifetime = TimeSpan.FromHours(2)
        };
        var subscriber = Substitute.For<ISiteStreamSubscriber>();

        var server = new SiteStreamGrpcServer(
            subscriber, NullLogger<SiteStreamGrpcServer>.Instance, Options.Create(options));

        Assert.Equal(250, server.MaxConcurrentStreams);
        Assert.Equal(TimeSpan.FromHours(2), server.MaxStreamLifetime);
    }
}

View File

@@ -0,0 +1,129 @@
using System.Collections.Concurrent;
using Microsoft.Extensions.Logging.Abstractions;
using ScadaLink.Communication.Grpc;
namespace ScadaLink.Communication.Tests.Grpc;
/// <summary>
/// Communication-007 regression coverage: the synchronous
/// <see cref="SiteStreamGrpcClientFactory.Dispose"/> must tear down each cached
/// client through <see cref="SiteStreamGrpcClient.Dispose"/> and must never
/// block on the asynchronous disposal path (sync-over-async).
/// </summary>
public class SiteStreamGrpcClientFactoryDisposeTests
{
    /// <summary>
    /// Client double whose disposal overrides do nothing except record which
    /// entry point (sync or async) was invoked.
    /// </summary>
    private sealed class TrackingClient : SiteStreamGrpcClient
    {
        public bool SyncDisposeCalled { get; private set; }

        public bool AsyncDisposeCalled { get; private set; }

        public override void Dispose()
        {
            SyncDisposeCalled = true;
        }

        public override ValueTask DisposeAsync()
        {
            AsyncDisposeCalled = true;
            return ValueTask.CompletedTask;
        }
    }

    /// <summary>
    /// Factory double that overrides only client creation, so the inherited
    /// <c>GetOrCreate</c> and <c>Dispose</c> are exercised against
    /// <see cref="TrackingClient"/> instances it remembers.
    /// </summary>
    private sealed class TrackingFactory : SiteStreamGrpcClientFactory
    {
        private readonly ConcurrentBag<TrackingClient> _created = new();

        public TrackingFactory()
            : base(NullLoggerFactory.Instance)
        {
        }

        /// <summary>Snapshot of every client this factory has created so far.</summary>
        public IReadOnlyCollection<TrackingClient> Created => _created.ToArray();

        protected override SiteStreamGrpcClient CreateClient(string grpcEndpoint)
        {
            var tracked = new TrackingClient();
            _created.Add(tracked);
            return tracked;
        }
    }

    [Fact]
    public void Dispose_DisposesClientsSynchronously_NotViaAsyncPath()
    {
        var factory = new TrackingFactory();
        factory.GetOrCreate("site-a", "http://localhost:5100");
        factory.GetOrCreate("site-b", "http://localhost:5200");

        factory.Dispose();

        var clients = factory.Created;
        Assert.NotEmpty(clients);
        Assert.All(clients, client =>
        {
            Assert.True(client.SyncDisposeCalled, "client should be disposed via synchronous Dispose()");
            Assert.False(client.AsyncDisposeCalled, "synchronous Dispose() must not route through DisposeAsync()");
        });
    }

    [Fact]
    public void Dispose_DoesNotDeadlock_UnderSingleThreadedSynchronizationContext()
    {
        // Everything posted to this context runs on its one worker thread. If
        // Dispose() blocked that thread waiting on a continuation that had to be
        // pumped by the same thread, the callback below would never finish.
        using var ctx = new SingleThreadSyncContext();
        var done = new ManualResetEventSlim();
        Exception? captured = null;

        // Runs on the context's worker thread: build a real factory, cache one
        // client, dispose synchronously, and always signal completion.
        void CreateAndDisposeFactory(object? _)
        {
            try
            {
                var factory = new SiteStreamGrpcClientFactory(NullLoggerFactory.Instance);
                factory.GetOrCreate("site-a", "http://localhost:5100");
                factory.Dispose();
            }
            catch (Exception ex)
            {
                captured = ex;
            }
            finally
            {
                done.Set();
            }
        }

        ctx.Post(CreateAndDisposeFactory, null);

        var completed = done.Wait(TimeSpan.FromSeconds(5));
        Assert.True(completed,
            "factory.Dispose() did not complete — likely a sync-over-async deadlock");
        Assert.Null(captured);
    }

    /// <summary>
    /// Bare-bones synchronization context for the deadlock test: posted callbacks
    /// are queued and executed one at a time on a single dedicated background thread.
    /// </summary>
    private sealed class SingleThreadSyncContext : SynchronizationContext, IDisposable
    {
        private readonly BlockingCollection<(SendOrPostCallback cb, object? state)> _queue = new();
        private readonly Thread _thread;

        public SingleThreadSyncContext()
        {
            _thread = new Thread(Run) { IsBackground = true };
            _thread.Start();
        }

        /// <summary>Worker loop: installs this context, then drains the queue until adding is completed.</summary>
        private void Run()
        {
            SetSynchronizationContext(this);
            foreach (var work in _queue.GetConsumingEnumerable())
            {
                work.cb(work.state);
            }
        }

        public override void Post(SendOrPostCallback d, object? state)
        {
            _queue.Add((d, state));
        }

        /// <summary>Stops accepting work and gives the worker up to two seconds to exit.</summary>
        public void Dispose()
        {
            _queue.CompleteAdding();
            _thread.Join(TimeSpan.FromSeconds(2));
        }
    }
}