fix(communication): resolve Communication-002/003 — gRPC reconnect stream cleanup and subscription map safety
This commit is contained in:
@@ -8,7 +8,7 @@
|
|||||||
| Last reviewed | 2026-05-16 |
|
| Last reviewed | 2026-05-16 |
|
||||||
| Reviewer | claude-agent |
|
| Reviewer | claude-agent |
|
||||||
| Commit reviewed | `9c60592` |
|
| Commit reviewed | `9c60592` |
|
||||||
| Open findings | 10 |
|
| Open findings | 8 |
|
||||||
|
|
||||||
## Summary
|
## Summary
|
||||||
|
|
||||||
@@ -98,7 +98,7 @@ references `Communication-001`.
|
|||||||
|--|--|
|
|--|--|
|
||||||
| Severity | High |
|
| Severity | High |
|
||||||
| Category | Error handling & resilience |
|
| Category | Error handling & resilience |
|
||||||
| Status | Open |
|
| Status | Resolved |
|
||||||
| Location | `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:170`, `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:143` |
|
| Location | `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:170`, `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:143` |
|
||||||
|
|
||||||
**Description**
|
**Description**
|
||||||
@@ -126,7 +126,14 @@ the gRPC cancellation reaches the site and stops the relay actor.
|
|||||||
|
|
||||||
**Resolution**
|
**Resolution**
|
||||||
|
|
||||||
_Unresolved._
|
Resolved 2026-05-16 (commit `<pending>`). Root cause confirmed against source:
|
||||||
|
`HandleGrpcError` flipped `_useNodeA` and scheduled `OpenGrpcStream` without ever
|
||||||
|
unsubscribing the failed stream, leaving the old node's `StreamRelayActor` zombie until
|
||||||
|
TCP/keepalive timeout. Fix: `HandleGrpcError` now resolves the client for the
|
||||||
|
*previous* endpoint (before flipping `_useNodeA`) and calls `Unsubscribe(_correlationId)`
|
||||||
|
on it, so the local CTS is cancelled and gRPC cancellation reaches the still-alive site.
|
||||||
|
Regression test `DebugStreamBridgeActorTests.On_GrpcError_Unsubscribes_Old_Stream_Before_Reconnect`
|
||||||
|
fails against the pre-fix code and passes after.
|
||||||
|
|
||||||
### Communication-003 — SiteStreamGrpcClient subscription map overwritten without disposal; reconnect can cancel the wrong stream
|
### Communication-003 — SiteStreamGrpcClient subscription map overwritten without disposal; reconnect can cancel the wrong stream
|
||||||
|
|
||||||
@@ -134,7 +141,7 @@ _Unresolved._
|
|||||||
|--|--|
|
|--|--|
|
||||||
| Severity | High |
|
| Severity | High |
|
||||||
| Category | Concurrency & thread safety |
|
| Category | Concurrency & thread safety |
|
||||||
| Status | Open |
|
| Status | Resolved |
|
||||||
| Location | `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClient.cs:77`, `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClient.cs:106` |
|
| Location | `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClient.cs:77`, `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClient.cs:106` |
|
||||||
|
|
||||||
**Description**
|
**Description**
|
||||||
@@ -161,7 +168,18 @@ caller-supplied correlation ID.
|
|||||||
|
|
||||||
**Resolution**
|
**Resolution**
|
||||||
|
|
||||||
_Unresolved._
|
Resolved 2026-05-16 (commit `<pending>`). Root cause confirmed against source: the
|
||||||
|
inline `_subscriptions[correlationId] = cts` overwrote a prior CTS without
|
||||||
|
cancel/dispose (leak), and the `finally`'s `TryRemove(correlationId, out _)` removed by
|
||||||
|
key only — a racing reconnect's live CTS could be removed by the prior call's `finally`,
|
||||||
|
orphaning the live stream. Fix: extracted two internal helpers used by `SubscribeAsync`
|
||||||
|
— `RegisterSubscription` cancels+disposes any existing CTS for the correlation ID before
|
||||||
|
inserting, and `RemoveSubscription` uses the `ConcurrentDictionary.TryRemove(KeyValuePair)`
|
||||||
|
overload so it removes only the CTS that call created (mirroring `SiteStreamGrpcServer`'s
|
||||||
|
`StreamEntry` pattern). Regression tests
|
||||||
|
`SiteStreamGrpcClientTests.RegisterSubscription_ReusedCorrelationId_CancelsAndDisposesPriorCts`
|
||||||
|
and `SiteStreamGrpcClientTests.RemoveSubscription_OnlyRemovesOwnCts_NotAReplacement`
|
||||||
|
fail against the pre-fix logic and pass after.
|
||||||
|
|
||||||
### Communication-004 — Coordinator actors declare no SupervisorStrategy (design requires Resume)
|
### Communication-004 — Coordinator actors declare no SupervisorStrategy (design requires Resume)
|
||||||
|
|
||||||
|
|||||||
@@ -183,6 +183,15 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Unsubscribe the failed stream on the *previous* endpoint before reconnecting.
|
||||||
|
// This cancels the local subscription CTS and -- where the channel is still
|
||||||
|
// alive -- propagates gRPC cancellation to the site so its SiteStreamGrpcServer
|
||||||
|
// stops the StreamRelayActor for this correlation ID, rather than leaving a
|
||||||
|
// zombie relay actor until TCP RST / keepalive eventually detects the loss.
|
||||||
|
var previousEndpoint = _useNodeA ? _grpcNodeAAddress : _grpcNodeBAddress;
|
||||||
|
var previousClient = _grpcFactory.GetOrCreate(_siteIdentifier, previousEndpoint);
|
||||||
|
previousClient.Unsubscribe(_correlationId);
|
||||||
|
|
||||||
// Flip to the other node
|
// Flip to the other node
|
||||||
_useNodeA = !_useNodeA;
|
_useNodeA = !_useNodeA;
|
||||||
|
|
||||||
|
|||||||
@@ -57,6 +57,32 @@ public class SiteStreamGrpcClient : IAsyncDisposable
|
|||||||
_subscriptions[correlationId] = cts;
|
_subscriptions[correlationId] = cts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Registers a subscription's CancellationTokenSource for a correlation ID.
|
||||||
|
/// If an entry already exists for that correlation ID (a reconnect race where two
|
||||||
|
/// <see cref="SubscribeAsync"/> calls briefly share an ID), the prior CTS is
|
||||||
|
/// cancelled and disposed so it cannot leak. Internal for testability.
|
||||||
|
/// </summary>
|
||||||
|
internal void RegisterSubscription(string correlationId, CancellationTokenSource cts)
|
||||||
|
{
|
||||||
|
if (_subscriptions.TryGetValue(correlationId, out var prior) && !ReferenceEquals(prior, cts))
|
||||||
|
{
|
||||||
|
prior.Cancel();
|
||||||
|
prior.Dispose();
|
||||||
|
}
|
||||||
|
_subscriptions[correlationId] = cts;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Removes the subscription entry for a correlation ID only if the stored CTS is
|
||||||
|
/// exactly the one supplied. A racing replacement stream may already own the slot,
|
||||||
|
/// in which case this is a no-op. Internal for testability.
|
||||||
|
/// </summary>
|
||||||
|
internal void RemoveSubscription(string correlationId, CancellationTokenSource cts)
|
||||||
|
{
|
||||||
|
_subscriptions.TryRemove(new KeyValuePair<string, CancellationTokenSource>(correlationId, cts));
|
||||||
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Opens a server-streaming subscription for a specific instance.
|
/// Opens a server-streaming subscription for a specific instance.
|
||||||
/// This is a long-running async method; the caller launches it as a background task.
|
/// This is a long-running async method; the caller launches it as a background task.
|
||||||
@@ -74,7 +100,7 @@ public class SiteStreamGrpcClient : IAsyncDisposable
|
|||||||
throw new InvalidOperationException("Cannot subscribe on a test-only client.");
|
throw new InvalidOperationException("Cannot subscribe on a test-only client.");
|
||||||
|
|
||||||
var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||||
_subscriptions[correlationId] = cts;
|
RegisterSubscription(correlationId, cts);
|
||||||
|
|
||||||
var request = new InstanceStreamRequest
|
var request = new InstanceStreamRequest
|
||||||
{
|
{
|
||||||
@@ -103,7 +129,8 @@ public class SiteStreamGrpcClient : IAsyncDisposable
|
|||||||
}
|
}
|
||||||
finally
|
finally
|
||||||
{
|
{
|
||||||
_subscriptions.TryRemove(correlationId, out _);
|
// Remove only our own entry -- a racing reconnect may already own the slot.
|
||||||
|
RemoveSubscription(correlationId, cts);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -159,6 +159,34 @@ public class DebugStreamBridgeActorTests : TestKit
|
|||||||
Assert.Equal("corr-1", ctx.MockGrpcClient.SubscribeCalls[1].CorrelationId);
|
Assert.Equal("corr-1", ctx.MockGrpcClient.SubscribeCalls[1].CorrelationId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void On_GrpcError_Unsubscribes_Old_Stream_Before_Reconnect()
|
||||||
|
{
|
||||||
|
// Communication-002 regression: a reconnect must unsubscribe the previous
|
||||||
|
// stream so the old node does not keep a zombie relay actor / subscription.
|
||||||
|
var ctx = CreateBridgeActor();
|
||||||
|
ctx.CommProbe.ExpectMsg<SiteEnvelope>();
|
||||||
|
|
||||||
|
var snapshot = new DebugViewSnapshot(
|
||||||
|
InstanceName,
|
||||||
|
new List<AttributeValueChanged>(),
|
||||||
|
new List<AlarmStateChanged>(),
|
||||||
|
DateTimeOffset.UtcNow);
|
||||||
|
|
||||||
|
ctx.BridgeActor.Tell(snapshot);
|
||||||
|
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, TimeSpan.FromSeconds(3));
|
||||||
|
|
||||||
|
// Simulate gRPC error → reconnect
|
||||||
|
ctx.MockGrpcClient.SubscribeCalls[0].OnError(new Exception("Stream broken"));
|
||||||
|
|
||||||
|
// Should resubscribe...
|
||||||
|
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 2, TimeSpan.FromSeconds(5));
|
||||||
|
|
||||||
|
// ...and must have unsubscribed the prior correlation ID so the old node's
|
||||||
|
// relay actor is released rather than left zombie.
|
||||||
|
Assert.Contains("corr-1", ctx.MockGrpcClient.UnsubscribedCorrelationIds);
|
||||||
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void After_MaxRetries_Terminates()
|
public void After_MaxRetries_Terminates()
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -176,4 +176,49 @@ public class SiteStreamGrpcClientTests
|
|||||||
Assert.True(cts1.IsCancellationRequested);
|
Assert.True(cts1.IsCancellationRequested);
|
||||||
Assert.True(cts2.IsCancellationRequested);
|
Assert.True(cts2.IsCancellationRequested);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --- Communication-003 regression tests ---
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void RegisterSubscription_ReusedCorrelationId_CancelsAndDisposesPriorCts()
|
||||||
|
{
|
||||||
|
// Two SubscribeAsync calls briefly sharing a correlation ID (reconnect race).
|
||||||
|
// Inserting the second must cancel + dispose the first so it does not leak.
|
||||||
|
var client = SiteStreamGrpcClient.CreateForTesting();
|
||||||
|
|
||||||
|
var first = new CancellationTokenSource();
|
||||||
|
var second = new CancellationTokenSource();
|
||||||
|
|
||||||
|
client.RegisterSubscription("corr-shared", first);
|
||||||
|
client.RegisterSubscription("corr-shared", second);
|
||||||
|
|
||||||
|
Assert.True(first.IsCancellationRequested);
|
||||||
|
// Disposed CTS throws ObjectDisposedException when its token is touched.
|
||||||
|
Assert.Throws<ObjectDisposedException>(() => _ = first.Token);
|
||||||
|
|
||||||
|
// The second (live) CTS must remain intact.
|
||||||
|
Assert.False(second.IsCancellationRequested);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void RemoveSubscription_OnlyRemovesOwnCts_NotAReplacement()
|
||||||
|
{
|
||||||
|
// First call's finally must NOT remove the second call's live entry.
|
||||||
|
var client = SiteStreamGrpcClient.CreateForTesting();
|
||||||
|
|
||||||
|
var first = new CancellationTokenSource();
|
||||||
|
var second = new CancellationTokenSource();
|
||||||
|
|
||||||
|
client.RegisterSubscription("corr-shared", first);
|
||||||
|
// A racing second SubscribeAsync replaces the entry.
|
||||||
|
client.RegisterSubscription("corr-shared", second);
|
||||||
|
|
||||||
|
// The first call's finally runs and tries to remove its (already-replaced) entry.
|
||||||
|
client.RemoveSubscription("corr-shared", first);
|
||||||
|
|
||||||
|
// The live (second) subscription must still be cancellable via Unsubscribe.
|
||||||
|
Assert.False(second.IsCancellationRequested);
|
||||||
|
client.Unsubscribe("corr-shared");
|
||||||
|
Assert.True(second.IsCancellationRequested);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user