fix(communication): resolve Communication-004..008 — Resume supervision, gRPC option wiring, address-load logging, sync dispose, flap detection

This commit is contained in:
Joseph Doherty
2026-05-16 20:58:03 -04:00
parent 3e7a3d7e31
commit 31a6995d24
12 changed files with 656 additions and 51 deletions

View File

@@ -22,6 +22,9 @@ public class DebugStreamBridgeActorTests : TestKit
{
// Use a very short reconnect delay for testing
DebugStreamBridgeActor.ReconnectDelay = TimeSpan.FromMilliseconds(100);
// Long stability window so streams are never considered "stable" mid-test
// unless a test deliberately waits it out.
DebugStreamBridgeActor.StabilityWindow = TimeSpan.FromSeconds(30);
}
private record TestContext(
@@ -264,8 +267,13 @@ public class DebugStreamBridgeActorTests : TestKit
}
[Fact]
public void Grpc_Error_Resets_RetryCount_On_Successful_Event()
public void FlappingStream_DeliveringEventsBetweenFailures_StillTerminatesAfterMaxRetries()
{
// Communication-008 regression: a stream that connects, delivers an event,
// then fails — repeatedly — must still trip MaxRetries. The retry count is
// NO LONGER reset by a received event (only by the stability window). The
// previous behaviour reset _retryCount on every event, so a flapping site
// reconnected forever and the debug session lived on indefinitely.
var ctx = CreateBridgeActor();
ctx.CommProbe.ExpectMsg<SiteEnvelope>();
@@ -275,30 +283,72 @@ public class DebugStreamBridgeActorTests : TestKit
new List<AlarmStateChanged>(),
DateTimeOffset.UtcNow);
Watch(ctx.BridgeActor);
ctx.BridgeActor.Tell(snapshot);
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, TimeSpan.FromSeconds(3));
// First error → retry 1
ctx.MockGrpcClient.SubscribeCalls[0].OnError(new Exception("Error 1"));
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 2, TimeSpan.FromSeconds(5));
// Simulate successful event (resets retry count)
var attrChange = new AttributeValueChanged(InstanceName, "IO", "Temp", 42.5, "Good", DateTimeOffset.UtcNow);
ctx.MockGrpcClient.SubscribeCalls[1].OnEvent(attrChange);
AwaitCondition(() => { lock (ctx.ReceivedEvents) { return ctx.ReceivedEvents.Count == 2; } },
TimeSpan.FromSeconds(3));
// Now another 3 errors should be tolerated (retry count was reset)
ctx.MockGrpcClient.SubscribeCalls[1].OnError(new Exception("Error 2"));
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 3, TimeSpan.FromSeconds(5));
// Flap: deliver one event then fail, three times. Each event would, under
// the old buggy logic, reset the retry budget and prevent termination.
for (var i = 0; i < 3; i++)
{
var call = ctx.MockGrpcClient.SubscribeCalls[i];
call.OnEvent(attrChange);
call.OnError(new Exception($"Flap {i + 1}"));
var expected = i + 2;
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == expected, TimeSpan.FromSeconds(5));
}
ctx.MockGrpcClient.SubscribeCalls[2].OnError(new Exception("Error 3"));
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 4, TimeSpan.FromSeconds(5));
// Fourth error (after the 3 retries) must exceed MaxRetries and terminate.
ctx.MockGrpcClient.SubscribeCalls[3].OnEvent(attrChange);
ctx.MockGrpcClient.SubscribeCalls[3].OnError(new Exception("Flap 4"));
ctx.MockGrpcClient.SubscribeCalls[3].OnError(new Exception("Error 4"));
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 5, TimeSpan.FromSeconds(5));
ExpectTerminated(ctx.BridgeActor, TimeSpan.FromSeconds(5));
Assert.True(ctx.TerminatedFlag[0]);
}
// Still alive — 3 retries from the second failure point succeeded
[Fact]
public void RetryCount_RecoveredOnlyAfterStreamStaysStableForStabilityWindow()
{
// Communication-008: after a stream has been connected for the stability
// window, the retry budget is recovered — a later transient fault then gets
// a fresh set of retries rather than being counted against the old budget.
//
// StabilityWindow is assigned through the type rather than an instance, so the
// shortened value below is shared state; the finally block puts the 30-second
// value back even if an assertion in the try block throws.
DebugStreamBridgeActor.StabilityWindow = TimeSpan.FromMilliseconds(300);
try
{
var ctx = CreateBridgeActor();
ctx.CommProbe.ExpectMsg<SiteEnvelope>();
// Empty snapshot (no attribute or alarm changes) sent to the bridge actor;
// the test then waits for the first subscribe call on the mock gRPC client.
var snapshot = new DebugViewSnapshot(
InstanceName,
new List<AttributeValueChanged>(),
new List<AlarmStateChanged>(),
DateTimeOffset.UtcNow);
Watch(ctx.BridgeActor);
ctx.BridgeActor.Tell(snapshot);
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, TimeSpan.FromSeconds(3));
// Five failures — but each stream is left up long enough (the mock
// stream only completes on cancel) for the stability window to elapse
// and reset the retry budget before its failure is injected.
for (var i = 0; i < 5; i++)
{
Thread.Sleep(450); // exceed the 300ms stability window
ctx.MockGrpcClient.SubscribeCalls[i].OnError(new Exception($"Error {i + 1}"));
// Copy the expected count into a per-iteration local so the lambda below
// does not capture the loop variable itself.
var expected = i + 2;
// Each failure must be followed by a fresh subscribe call (a reconnect).
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == expected, TimeSpan.FromSeconds(5));
}
// Five well-spaced failures did NOT terminate the actor because each
// reconnect recovered its retry budget after the stability window.
Assert.False(ctx.TerminatedFlag[0]);
}
finally
{
// Restore the long window so streams are not treated as stable by accident
// once this test has finished.
DebugStreamBridgeActor.StabilityWindow = TimeSpan.FromSeconds(30);
}
}
}