fix(communication): resolve Communication-002/003 — gRPC reconnect stream cleanup and subscription map safety

This commit is contained in:
Joseph Doherty
2026-05-16 19:33:09 -04:00
parent 87f14c190a
commit 301e7fb854
5 changed files with 134 additions and 7 deletions

View File

@@ -183,6 +183,15 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
return;
}
// Unsubscribe the failed stream on the *previous* endpoint before reconnecting.
// This cancels the local subscription CTS and -- where the channel is still
// alive -- propagates gRPC cancellation to the site so its SiteStreamGrpcServer
// stops the StreamRelayActor for this correlation ID, rather than leaving a
// zombie relay actor until TCP RST / keepalive eventually detects the loss.
var previousEndpoint = _useNodeA ? _grpcNodeAAddress : _grpcNodeBAddress;
var previousClient = _grpcFactory.GetOrCreate(_siteIdentifier, previousEndpoint);
previousClient.Unsubscribe(_correlationId);
// Flip to the other node
_useNodeA = !_useNodeA;

View File

@@ -57,6 +57,32 @@ public class SiteStreamGrpcClient : IAsyncDisposable
_subscriptions[correlationId] = cts;
}
/// <summary>
/// Registers a subscription's <see cref="CancellationTokenSource"/> for a correlation ID.
/// If a different CTS is already stored for that correlation ID (a reconnect race where
/// two <see cref="SubscribeAsync"/> calls briefly share an ID), the new CTS atomically
/// takes over the slot and the displaced CTS is cancelled and disposed so it cannot leak.
/// Registering the CTS that is already stored is a no-op. Internal for testability.
/// </summary>
/// <param name="correlationId">Key identifying the subscription slot.</param>
/// <param name="cts">The CTS that should own the slot once this method returns.</param>
internal void RegisterSubscription(string correlationId, CancellationTokenSource cts)
{
    // Compare-and-swap loop: a plain TryGetValue followed by an indexer write is a
    // check-then-act race. Two concurrent registrations could both observe the same
    // prior CTS (double Cancel/Dispose), or one could silently overwrite the other and
    // leave a live CTS untracked. TryAdd/TryUpdate succeed only if the slot is still in
    // the state we observed; otherwise we re-read and try again.
    while (true)
    {
        if (!_subscriptions.TryGetValue(correlationId, out var prior))
        {
            if (_subscriptions.TryAdd(correlationId, cts))
            {
                return;
            }

            // Another thread populated the slot between the read and the add; retry.
            continue;
        }

        if (ReferenceEquals(prior, cts))
        {
            // Already registered; never cancel or dispose the CTS we were asked to store.
            return;
        }

        if (_subscriptions.TryUpdate(correlationId, cts, prior))
        {
            // Only the thread that actually displaced 'prior' tears it down, and only
            // after the new CTS is in place, so a throwing cancellation callback cannot
            // leave the new subscription unregistered.
            prior.Cancel();
            prior.Dispose();
            return;
        }
    }
}
/// <summary>
/// Drops the subscription entry for <paramref name="correlationId"/>, but only when the
/// CTS currently stored under that ID is the very instance passed in. If a racing
/// replacement stream has already taken over the slot, the map is left untouched.
/// Internal for testability.
/// </summary>
/// <param name="correlationId">Key of the subscription slot to clear.</param>
/// <param name="cts">The CTS the caller owns; the entry is removed only if it still holds this instance.</param>
internal void RemoveSubscription(string correlationId, CancellationTokenSource cts)
{
    // Key + value pair form of TryRemove: the removal is conditional on both matching.
    var ownedEntry = new KeyValuePair<string, CancellationTokenSource>(correlationId, cts);
    _ = _subscriptions.TryRemove(ownedEntry);
}
/// <summary>
/// Opens a server-streaming subscription for a specific instance.
/// This is a long-running async method; the caller launches it as a background task.
@@ -74,7 +100,7 @@ public class SiteStreamGrpcClient : IAsyncDisposable
throw new InvalidOperationException("Cannot subscribe on a test-only client.");
var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
_subscriptions[correlationId] = cts;
RegisterSubscription(correlationId, cts);
var request = new InstanceStreamRequest
{
@@ -103,7 +129,8 @@ public class SiteStreamGrpcClient : IAsyncDisposable
}
finally
{
_subscriptions.TryRemove(correlationId, out _);
// Remove only our own entry -- a racing reconnect may already own the slot.
RemoveSubscription(correlationId, cts);
}
}